Add files using upload-large-folder tool
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff
- opencompass/.github/workflows/link-check.yml +26 -0
- opencompass/docs/en/.readthedocs.yaml +17 -0
- opencompass/docs/en/Makefile +20 -0
- opencompass/docs/en/advanced_guides/accelerator_intro.md +142 -0
- opencompass/docs/en/advanced_guides/circular_eval.md +113 -0
- opencompass/docs/en/advanced_guides/code_eval.md +104 -0
- opencompass/docs/en/advanced_guides/code_eval_service.md +224 -0
- opencompass/docs/en/advanced_guides/contamination_eval.md +124 -0
- opencompass/docs/en/advanced_guides/custom_dataset.md +149 -0
- opencompass/docs/en/advanced_guides/evaluation_lightllm.md +71 -0
- opencompass/docs/en/advanced_guides/evaluation_lmdeploy.md +88 -0
- opencompass/docs/en/advanced_guides/longeval.md +169 -0
- opencompass/docs/en/advanced_guides/needleinahaystack_eval.md +197 -0
- opencompass/docs/en/advanced_guides/new_dataset.md +57 -0
- opencompass/docs/en/advanced_guides/new_model.md +73 -0
- opencompass/docs/en/advanced_guides/objective_judgelm_evaluation.md +186 -0
- opencompass/docs/en/advanced_guides/prompt_attack.md +108 -0
- opencompass/docs/en/advanced_guides/subjective_evaluation.md +171 -0
- opencompass/docs/en/conf.py +222 -0
- opencompass/docs/en/docutils.conf +2 -0
- opencompass/docs/en/get_started/faq.md +128 -0
- opencompass/docs/en/get_started/installation.md +141 -0
- opencompass/docs/en/get_started/quick_start.md +300 -0
- opencompass/docs/en/index.rst +94 -0
- opencompass/docs/en/prompt/meta_template.md +263 -0
- opencompass/docs/en/tools.md +133 -0
- opencompass/docs/zh_cn/.readthedocs.yaml +17 -0
- opencompass/docs/zh_cn/Makefile +20 -0
- opencompass/docs/zh_cn/_static/css/readthedocs.css +62 -0
- opencompass/docs/zh_cn/_static/image/logo.svg +79 -0
- opencompass/docs/zh_cn/_static/image/logo_icon.svg +31 -0
- opencompass/docs/zh_cn/_static/js/custom.js +10 -0
- opencompass/docs/zh_cn/_templates/404.html +18 -0
- opencompass/docs/zh_cn/_templates/autosummary/class.rst +13 -0
- opencompass/docs/zh_cn/_templates/callable.rst +14 -0
- opencompass/docs/zh_cn/advanced_guides/accelerator_intro.md +142 -0
- opencompass/docs/zh_cn/advanced_guides/circular_eval.md +111 -0
- opencompass/docs/zh_cn/advanced_guides/code_eval.md +106 -0
- opencompass/docs/zh_cn/advanced_guides/code_eval_service.md +222 -0
- opencompass/docs/zh_cn/advanced_guides/compassbench_intro.md +194 -0
- opencompass/docs/zh_cn/advanced_guides/compassbench_v2_0.md +48 -0
- opencompass/docs/zh_cn/advanced_guides/contamination_eval.md +122 -0
- opencompass/docs/zh_cn/advanced_guides/custom_dataset.md +147 -0
- opencompass/docs/zh_cn/advanced_guides/evaluation_lightllm.md +71 -0
- opencompass/docs/zh_cn/advanced_guides/evaluation_lmdeploy.md +86 -0
- opencompass/docs/zh_cn/advanced_guides/longeval.md +169 -0
- opencompass/docs/zh_cn/advanced_guides/needleinahaystack_eval.md +195 -0
- opencompass/docs/zh_cn/advanced_guides/new_dataset.md +58 -0
- opencompass/docs/zh_cn/advanced_guides/new_model.md +73 -0
- opencompass/docs/zh_cn/advanced_guides/objective_judgelm_evaluation.md +186 -0
opencompass/.github/workflows/link-check.yml
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: 'Link check'
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
schedule:
|
| 5 |
+
# check links at 01:30 a.m. every day
|
| 6 |
+
- cron: '30 1 * * *'
|
| 7 |
+
|
| 8 |
+
workflow_dispatch: # allow manual trigger
|
| 9 |
+
|
| 10 |
+
jobs:
|
| 11 |
+
link-check:
|
| 12 |
+
runs-on: ubuntu-latest
|
| 13 |
+
steps:
|
| 14 |
+
# - uses: actions/checkout@v3
|
| 15 |
+
|
| 16 |
+
- name: Install linkchecker
|
| 17 |
+
run: |
|
| 18 |
+
pip install linkchecker
|
| 19 |
+
|
| 20 |
+
- name: Run linkchecker
|
| 21 |
+
run: |
|
| 22 |
+
linkchecker https://opencompass.readthedocs.io/ --no-robots -t 30 --no-warnings \
|
| 23 |
+
--ignore-url "https://opencompass.readthedocs.io/.*/static/images/opencompass_logo.svg" \
|
| 24 |
+
--ignore-url "https://opencompass.readthedocs.io/.*/_static/images/icon-menu-dots.svg" \
|
| 25 |
+
--ignore-url "https://opencompass.readthedocs.io/policy" \
|
| 26 |
+
--ignore-url "https://opencompass.readthedocs.io/(en|zh_CN)/[0-9a-f]{40}/.*"
|
opencompass/docs/en/.readthedocs.yaml
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: 2
|
| 2 |
+
|
| 3 |
+
# Set the version of Python and other tools you might need
|
| 4 |
+
build:
|
| 5 |
+
os: ubuntu-22.04
|
| 6 |
+
tools:
|
| 7 |
+
python: "3.8"
|
| 8 |
+
|
| 9 |
+
formats:
|
| 10 |
+
- epub
|
| 11 |
+
|
| 12 |
+
sphinx:
|
| 13 |
+
configuration: docs/en/conf.py
|
| 14 |
+
|
| 15 |
+
python:
|
| 16 |
+
install:
|
| 17 |
+
- requirements: requirements/docs.txt
|
opencompass/docs/en/Makefile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Minimal makefile for Sphinx documentation
|
| 2 |
+
#
|
| 3 |
+
|
| 4 |
+
# You can set these variables from the command line, and also
|
| 5 |
+
# from the environment for the first two.
|
| 6 |
+
SPHINXOPTS ?=
|
| 7 |
+
SPHINXBUILD ?= sphinx-build
|
| 8 |
+
SOURCEDIR = .
|
| 9 |
+
BUILDDIR = _build
|
| 10 |
+
|
| 11 |
+
# Put it first so that "make" without argument is like "make help".
|
| 12 |
+
help:
|
| 13 |
+
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
| 14 |
+
|
| 15 |
+
.PHONY: help Makefile
|
| 16 |
+
|
| 17 |
+
# Catch-all target: route all unknown targets to Sphinx using the new
|
| 18 |
+
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
| 19 |
+
%: Makefile
|
| 20 |
+
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
opencompass/docs/en/advanced_guides/accelerator_intro.md
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Accelerate Evaluation Inference with vLLM or LMDeploy
|
| 2 |
+
|
| 3 |
+
## Background
|
| 4 |
+
|
| 5 |
+
During the OpenCompass evaluation process, the Huggingface transformers library is used for inference by default. While this is a very general solution, there are scenarios where more efficient inference methods are needed to speed up the process, such as leveraging VLLM or LMDeploy.
|
| 6 |
+
|
| 7 |
+
- [LMDeploy](https://github.com/InternLM/lmdeploy) is a toolkit designed for compressing, deploying, and serving large language models (LLMs), developed by the [MMRazor](https://github.com/open-mmlab/mmrazor) and [MMDeploy](https://github.com/open-mmlab/mmdeploy) teams.
|
| 8 |
+
- [vLLM](https://github.com/vllm-project/vllm) is a fast and user-friendly library for LLM inference and serving, featuring advanced serving throughput, efficient PagedAttention memory management, continuous batching of requests, fast model execution via CUDA/HIP graphs, quantization techniques (e.g., GPTQ, AWQ, SqueezeLLM, FP8 KV Cache), and optimized CUDA kernels.
|
| 9 |
+
|
| 10 |
+
## Preparation for Acceleration
|
| 11 |
+
|
| 12 |
+
First, check whether the model you want to evaluate supports inference acceleration using vLLM or LMDeploy. Additionally, ensure you have installed vLLM or LMDeploy as per their official documentation. Below are the installation methods for reference:
|
| 13 |
+
|
| 14 |
+
### LMDeploy Installation Method
|
| 15 |
+
|
| 16 |
+
Install LMDeploy using pip (Python 3.8+) or from [source](https://github.com/InternLM/lmdeploy/blob/main/docs/en/build.md):
|
| 17 |
+
|
| 18 |
+
```bash
|
| 19 |
+
pip install lmdeploy
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
### VLLM Installation Method
|
| 23 |
+
|
| 24 |
+
Install vLLM using pip or from [source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
|
| 25 |
+
|
| 26 |
+
```bash
|
| 27 |
+
pip install vllm
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
## Accelerated Evaluation Using VLLM or LMDeploy
|
| 31 |
+
|
| 32 |
+
### Method 1: Using Command Line Parameters to Change the Inference Backend
|
| 33 |
+
|
| 34 |
+
OpenCompass offers one-click evaluation acceleration. During evaluation, it can automatically convert Huggingface transformer models to VLLM or LMDeploy models for use. Below is an example code for evaluating the GSM8k dataset using the default Huggingface version of the llama3-8b-instruct model:
|
| 35 |
+
|
| 36 |
+
```python
|
| 37 |
+
# eval_gsm8k.py
|
| 38 |
+
from mmengine.config import read_base
|
| 39 |
+
|
| 40 |
+
with read_base():
|
| 41 |
+
# Select a dataset list
|
| 42 |
+
from .datasets.gsm8k.gsm8k_0shot_gen_a58960 import gsm8k_datasets as datasets
|
| 43 |
+
# Select an interested model
|
| 44 |
+
from ..models.hf_llama.hf_llama3_8b_instruct import models
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
Here, `hf_llama3_8b_instruct` specifies the original Huggingface model configuration, as shown below:
|
| 48 |
+
|
| 49 |
+
```python
|
| 50 |
+
from opencompass.models import HuggingFacewithChatTemplate
|
| 51 |
+
|
| 52 |
+
models = [
|
| 53 |
+
dict(
|
| 54 |
+
type=HuggingFacewithChatTemplate,
|
| 55 |
+
abbr='llama-3-8b-instruct-hf',
|
| 56 |
+
path='meta-llama/Meta-Llama-3-8B-Instruct',
|
| 57 |
+
max_out_len=1024,
|
| 58 |
+
batch_size=8,
|
| 59 |
+
run_cfg=dict(num_gpus=1),
|
| 60 |
+
stop_words=['<|end_of_text|>', '<|eot_id|>'],
|
| 61 |
+
)
|
| 62 |
+
]
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
To evaluate the GSM8k dataset using the default Huggingface version of the llama3-8b-instruct model, use:
|
| 66 |
+
|
| 67 |
+
```bash
|
| 68 |
+
python run.py config/eval_gsm8k.py
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
To accelerate the evaluation using vLLM or LMDeploy, you can use the following script:
|
| 72 |
+
|
| 73 |
+
```bash
|
| 74 |
+
python run.py config/eval_gsm8k.py -a vllm
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
or
|
| 78 |
+
|
| 79 |
+
```bash
|
| 80 |
+
python run.py config/eval_gsm8k.py -a lmdeploy
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
### Method 2: Accelerating Evaluation via Deployed Inference Acceleration Service API
|
| 84 |
+
|
| 85 |
+
OpenCompass also supports accelerating evaluation by deploying vLLM or LMDeploy inference acceleration service APIs. Follow these steps:
|
| 86 |
+
|
| 87 |
+
1. Install the openai package:
|
| 88 |
+
|
| 89 |
+
```bash
|
| 90 |
+
pip install openai
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
2. Deploy the inference acceleration service API for vLLM or LMDeploy. Below is an example for LMDeploy:
|
| 94 |
+
|
| 95 |
+
```bash
|
| 96 |
+
lmdeploy serve api_server meta-llama/Meta-Llama-3-8B-Instruct --model-name Meta-Llama-3-8B-Instruct --server-port 23333
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
Parameters for starting the api_server can be checked using `lmdeploy serve api_server -h`, such as --tp for tensor parallelism, --session-len for the maximum context window length, --cache-max-entry-count for adjusting the k/v cache memory usage ratio, etc.
|
| 100 |
+
|
| 101 |
+
3. Once the service is successfully deployed, modify the evaluation script by changing the model configuration path to the service address, as shown below:
|
| 102 |
+
|
| 103 |
+
```python
|
| 104 |
+
from opencompass.models import OpenAISDK
|
| 105 |
+
|
| 106 |
+
api_meta_template = dict(
|
| 107 |
+
round=[
|
| 108 |
+
dict(role='HUMAN', api_role='HUMAN'),
|
| 109 |
+
dict(role='BOT', api_role='BOT', generate=True),
|
| 110 |
+
],
|
| 111 |
+
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
|
| 112 |
+
)
|
| 113 |
+
|
| 114 |
+
models = [
|
| 115 |
+
dict(
|
| 116 |
+
abbr='Meta-Llama-3-8B-Instruct-LMDeploy-API',
|
| 117 |
+
type=OpenAISDK,
|
| 118 |
+
key='EMPTY', # API key
|
| 119 |
+
openai_api_base='http://0.0.0.0:23333/v1', # Service address
|
| 120 |
+
path='Meta-Llama-3-8B-Instruct', # Model name for service request
|
| 121 |
+
tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', # The tokenizer name or path, if set to `None`, uses the default `gpt-4` tokenizer
|
| 122 |
+
rpm_verbose=True, # Whether to print request rate
|
| 123 |
+
meta_template=api_meta_template, # Service request template
|
| 124 |
+
query_per_second=1, # Service request rate
|
| 125 |
+
max_out_len=1024, # Maximum output length
|
| 126 |
+
max_seq_len=4096, # Maximum input length
|
| 127 |
+
temperature=0.01, # Generation temperature
|
| 128 |
+
batch_size=8, # Batch size
|
| 129 |
+
retry=3, # Number of retries
|
| 130 |
+
)
|
| 131 |
+
]
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
## Acceleration Effect and Performance Comparison
|
| 135 |
+
|
| 136 |
+
Below is a comparison table of the acceleration effect and performance when using VLLM or LMDeploy on a single A800 GPU for evaluating the Llama-3-8B-Instruct model on the GSM8k dataset:
|
| 137 |
+
|
| 138 |
+
| Inference Backend | Accuracy | Inference Time (minutes:seconds) | Speedup (relative to Huggingface) |
|
| 139 |
+
| ----------------- | -------- | -------------------------------- | --------------------------------- |
|
| 140 |
+
| Huggingface | 74.22 | 24:26 | 1.0 |
|
| 141 |
+
| LMDeploy | 73.69 | 11:15 | 2.2 |
|
| 142 |
+
| VLLM | 72.63 | 07:52 | 3.1 |
|
opencompass/docs/en/advanced_guides/circular_eval.md
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CircularEval
|
| 2 |
+
|
| 3 |
+
## Background
|
| 4 |
+
|
| 5 |
+
For multiple-choice questions, when a Language Model (LLM) provides the correct option, it does not necessarily imply a true understanding and reasoning of the question. It could be a guess. To differentiate these scenarios and reduce LLM bias towards options, CircularEval can be utilized. A multiple-choice question is augmented by shuffling its options, and if the LLM correctly answers all variations of the augmented question, it is considered correct under CircularEval.
|
| 6 |
+
|
| 7 |
+
## Adding Your Own CircularEval Dataset
|
| 8 |
+
|
| 9 |
+
Generally, to evaluate a dataset using CircularEval, both its loading and evaluation methods need to be rewritten. Modifications are required in both the OpenCompass main library and configuration files. We will use C-Eval as an example for explanation.
|
| 10 |
+
|
| 11 |
+
OpenCompass main library:
|
| 12 |
+
|
| 13 |
+
```python
|
| 14 |
+
from opencompass.datasets.ceval import CEvalDataset
|
| 15 |
+
from opencompass.datasets.circular import CircularDatasetMeta
|
| 16 |
+
|
| 17 |
+
class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta):
|
| 18 |
+
# The overloaded dataset class
|
| 19 |
+
dataset_class = CEvalDataset
|
| 20 |
+
|
| 21 |
+
# Splits of the DatasetDict that need CircularEval. For CEvalDataset, which loads [dev, val, test], we only need 'val' and 'test' for CircularEval, not 'dev'
|
| 22 |
+
default_circular_splits = ['val', 'test']
|
| 23 |
+
|
| 24 |
+
# List of keys to be shuffled
|
| 25 |
+
default_option_keys = ['A', 'B', 'C', 'D']
|
| 26 |
+
|
| 27 |
+
# If the content of 'answer_key' is one of ['A', 'B', 'C', 'D'], representing the correct answer. This field indicates how to update the correct answer after shuffling options. Choose either this or default_answer_key_switch_method
|
| 28 |
+
default_answer_key = 'answer'
|
| 29 |
+
|
| 30 |
+
# If 'answer_key' content is not one of ['A', 'B', 'C', 'D'], a function can be used to specify the correct answer after shuffling options. Choose either this or default_answer_key
|
| 31 |
+
# def default_answer_key_switch_method(item, circular_pattern):
|
| 32 |
+
# # 'item' is the original data item
|
| 33 |
+
# # 'circular_pattern' is a tuple indicating the order after shuffling options, e.g., ('D', 'A', 'B', 'C') means the original option A is now D, and so on
|
| 34 |
+
# item['answer'] = circular_pattern['ABCD'.index(item['answer'])]
|
| 35 |
+
# return item
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
`CircularCEvalDataset` accepts the `circular_pattern` parameter with two values:
|
| 39 |
+
|
| 40 |
+
- `circular`: Indicates a single cycle. It is the default value. ABCD is expanded to ABCD, BCDA, CDAB, DABC, a total of 4 variations.
|
| 41 |
+
- `all_possible`: Indicates all permutations. ABCD is expanded to ABCD, ABDC, ACBD, ACDB, ADBC, ADCB, BACD, ..., a total of 24 variations.
|
| 42 |
+
|
| 43 |
+
Additionally, we provide a `CircularEvaluator` to replace `AccEvaluator`. This Evaluator also accepts `circular_pattern`, and it should be consistent with the above. It produces the following metrics:
|
| 44 |
+
|
| 45 |
+
- `acc_{origin|circular|all_possible}`: Treating each question with shuffled options as separate, calculating accuracy.
|
| 46 |
+
- `perf_{origin|circular|all_possible}`: Following Circular logic, a question is considered correct only if all its variations with shuffled options are answered correctly, calculating accuracy.
|
| 47 |
+
- `more_{num}_{origin|circular|all_possible}`: According to Circular logic, a question is deemed correct if the number of its variations answered correctly is greater than or equal to num, calculating accuracy.
|
| 48 |
+
|
| 49 |
+
OpenCompass configuration file:
|
| 50 |
+
|
| 51 |
+
```python
|
| 52 |
+
from mmengine.config import read_base
|
| 53 |
+
from opencompass.datasets.circular import CircularCEvalDataset
|
| 54 |
+
|
| 55 |
+
with read_base():
|
| 56 |
+
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
|
| 57 |
+
|
| 58 |
+
for d in ceval_datasets:
|
| 59 |
+
# Overloading the load method
|
| 60 |
+
d['type'] = CircularCEvalDataset
|
| 61 |
+
# Renaming for differentiation from non-circular evaluation versions
|
| 62 |
+
d['abbr'] = d['abbr'] + '-circular-4'
|
| 63 |
+
# Overloading the evaluation method
|
| 64 |
+
d['eval_cfg']['evaluator'] = {'type': CircularEvaluator}
|
| 65 |
+
|
| 66 |
+
# The dataset after the above operations looks like this:
|
| 67 |
+
# dict(
|
| 68 |
+
# type=CircularCEvalDataset,
|
| 69 |
+
# path='./data/ceval/formal_ceval', # Unchanged
|
| 70 |
+
# name='computer_network', # Unchanged
|
| 71 |
+
# abbr='ceval-computer_network-circular-4',
|
| 72 |
+
# reader_cfg=dict(...), # Unchanged
|
| 73 |
+
# infer_cfg=dict(...), # Unchanged
|
| 74 |
+
# eval_cfg=dict(evaluator=dict(type=CircularEvaluator), ...),
|
| 75 |
+
# )
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
Additionally, for better presentation of results in CircularEval, consider using the following summarizer:
|
| 79 |
+
|
| 80 |
+
```python
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
from mmengine.config import read_base
|
| 84 |
+
from opencompass.summarizers import CircularSummarizer
|
| 85 |
+
|
| 86 |
+
with read_base():
|
| 87 |
+
from ...summarizers.groups.ceval.ceval_summary_groups
|
| 88 |
+
|
| 89 |
+
new_summary_groups = []
|
| 90 |
+
for item in ceval_summary_groups:
|
| 91 |
+
new_summary_groups.append(
|
| 92 |
+
{
|
| 93 |
+
'name': item['name'] + '-circular-4',
|
| 94 |
+
'subsets': [i + '-circular-4' for i in item['subsets']],
|
| 95 |
+
}
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
summarizer = dict(
|
| 99 |
+
type=CircularSummarizer,
|
| 100 |
+
# Select specific metrics to view
|
| 101 |
+
metric_types=['acc_origin', 'perf_circular'],
|
| 102 |
+
dataset_abbrs = [
|
| 103 |
+
'ceval-circular-4',
|
| 104 |
+
'ceval-humanities-circular-4',
|
| 105 |
+
'ceval-stem-circular-4',
|
| 106 |
+
'ceval-social-science-circular-4',
|
| 107 |
+
'ceval-other-circular-4',
|
| 108 |
+
],
|
| 109 |
+
summary_groups=new_summary_groups,
|
| 110 |
+
)
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
For more complex evaluation examples, refer to this sample code: https://github.com/open-compass/opencompass/tree/main/configs/eval_circular.py
|
opencompass/docs/en/advanced_guides/code_eval.md
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Code Evaluation Tutorial
|
| 2 |
+
|
| 3 |
+
This tutorial primarily focuses on evaluating a model's coding proficiency, using `humaneval` and `mbpp` as examples.
|
| 4 |
+
|
| 5 |
+
## pass@1
|
| 6 |
+
|
| 7 |
+
If you only need to generate a single response to evaluate the pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and [configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py), referring to the general [quick start tutorial](../get_started/quick_start.md).
|
| 8 |
+
|
| 9 |
+
For multilingual evaluation, please refer to the [Multilingual Code Evaluation Tutorial](./code_eval_service.md).
|
| 10 |
+
|
| 11 |
+
## pass@k
|
| 12 |
+
|
| 13 |
+
If you need to generate multiple responses for a single example to evaluate the pass@k performance, consider the following two situations. Here we take 10 responses as an example:
|
| 14 |
+
|
| 15 |
+
### Typical Situation
|
| 16 |
+
|
| 17 |
+
For most models that support the `num_return_sequences` parameter in HF's generation, we can use it directly to obtain multiple responses. Refer to the following configuration file:
|
| 18 |
+
|
| 19 |
+
```python
|
| 20 |
+
from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator
|
| 21 |
+
|
| 22 |
+
with read_base():
|
| 23 |
+
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
| 24 |
+
from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
|
| 25 |
+
|
| 26 |
+
mbpp_datasets[0]['type'] = MBPPDatasetV2
|
| 27 |
+
mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
|
| 28 |
+
mbpp_datasets[0]['reader_cfg']['output_column'] = 'test_column'
|
| 29 |
+
|
| 30 |
+
datasets = []
|
| 31 |
+
datasets += humaneval_datasets
|
| 32 |
+
datasets += mbpp_datasets
|
| 33 |
+
|
| 34 |
+
models = [
|
| 35 |
+
dict(
|
| 36 |
+
type=HuggingFaceCausalLM,
|
| 37 |
+
...,
|
| 38 |
+
generation_kwargs=dict(
|
| 39 |
+
num_return_sequences=10,
|
| 40 |
+
do_sample=True,
|
| 41 |
+
top_p=0.95,
|
| 42 |
+
temperature=0.8,
|
| 43 |
+
),
|
| 44 |
+
...,
|
| 45 |
+
)
|
| 46 |
+
]
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
For `mbpp`, new changes are needed in the dataset and evaluation, so we simultaneously modify the `type`, `eval_cfg.evaluator.type`, `reader_cfg.output_column` fields to accommodate these requirements.
|
| 50 |
+
|
| 51 |
+
We also need model responses with randomness, thus setting the `generation_kwargs` parameter is necessary. Note that we need to set `num_return_sequences` to get the number of responses.
|
| 52 |
+
|
| 53 |
+
Note: `num_return_sequences` must be greater than or equal to k, as pass@k itself is a probability estimate.
|
| 54 |
+
|
| 55 |
+
You can specifically refer to the following configuration file [configs/eval_code_passk.py](https://github.com/open-compass/opencompass/blob/main/configs/eval_code_passk.py)
|
| 56 |
+
|
| 57 |
+
### For Models That Do Not Support Multiple Responses
|
| 58 |
+
|
| 59 |
+
This applies to some HF models with poorly designed APIs or missing features. In this case, we need to repeatedly construct datasets to achieve multiple response effects. Refer to the following configuration:
|
| 60 |
+
|
| 61 |
+
```python
|
| 62 |
+
from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator
|
| 63 |
+
|
| 64 |
+
with read_base():
|
| 65 |
+
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
| 66 |
+
from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
|
| 67 |
+
|
| 68 |
+
humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10'
|
| 69 |
+
humaneval_datasets[0]['num_repeats'] = 10
|
| 70 |
+
mbpp_datasets[0]['abbr'] = 'mbpp_pass10'
|
| 71 |
+
mbpp_datasets[0]['num_repeats'] = 10
|
| 72 |
+
mbpp_datasets[0]['type'] = MBPPDatasetV2
|
| 73 |
+
mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
|
| 74 |
+
mbpp_datasets[0]['reader_cfg']['output_column'] = 'test_column'
|
| 75 |
+
|
| 76 |
+
datasets = []
|
| 77 |
+
datasets += humaneval_datasets
|
| 78 |
+
datasets += mbpp_datasets
|
| 79 |
+
|
| 80 |
+
models = [
|
| 81 |
+
dict(
|
| 82 |
+
type=HuggingFaceCausalLM,
|
| 83 |
+
...,
|
| 84 |
+
generation_kwargs=dict(
|
| 85 |
+
do_sample=True,
|
| 86 |
+
top_p=0.95,
|
| 87 |
+
temperature=0.8,
|
| 88 |
+
),
|
| 89 |
+
...,
|
| 90 |
+
)
|
| 91 |
+
]
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
Since the dataset's prompt has not been modified, we need to replace the corresponding fields to achieve the purpose of repeating the dataset.
|
| 95 |
+
You need to modify these fields:
|
| 96 |
+
|
| 97 |
+
- `num_repeats`: the number of times the dataset is repeated
|
| 98 |
+
- `abbr`: It's best to modify the dataset abbreviation along with the number of repetitions because the number of datasets will change, preventing potential issues arising from discrepancies with the values in `.cache/dataset_size.json`.
|
| 99 |
+
|
| 100 |
+
For `mbpp`, modify the `type`, `eval_cfg.evaluator.type`, `reader_cfg.output_column` fields as well.
|
| 101 |
+
|
| 102 |
+
We also need model responses with randomness, thus setting the `generation_kwargs` parameter is necessary.
|
| 103 |
+
|
| 104 |
+
You can specifically refer to the following configuration file [configs/eval_code_passk_repeat_dataset.py](https://github.com/open-compass/opencompass/blob/main/configs/eval_code_passk_repeat_dataset.py)
|
opencompass/docs/en/advanced_guides/code_eval_service.md
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Code Evaluation Docker Tutorial
|
| 2 |
+
|
| 3 |
+
To complete the LLM code capability evaluation, we need to build a separate evaluation environment to avoid executing erroneous code in the development environment, which would inevitably cause losses. The code evaluation service currently used by OpenCompass can refer to the [code-evaluator](https://github.com/open-compass/code-evaluator) project. The following will introduce evaluation tutorials around the code evaluation service.
|
| 4 |
+
|
| 5 |
+
1. humaneval-x
|
| 6 |
+
|
| 7 |
+
This is a multi-programming language dataset [humaneval-x](https://huggingface.co/datasets/THUDM/humaneval-x).
|
| 8 |
+
You can download the dataset from this [download link](https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx). Please download the language file (xx.jsonl.gz) that needs to be evaluated and place it in the `./data/humanevalx` folder.
|
| 9 |
+
|
| 10 |
+
The currently supported languages are `python`, `cpp`, `go`, `java`, `js`.
|
| 11 |
+
|
| 12 |
+
2. DS1000
|
| 13 |
+
|
| 14 |
+
This is a Python multi-algorithm library dataset [ds1000](https://github.com/xlang-ai/DS-1000).
|
| 15 |
+
You can download the dataset from this [download link](https://github.com/xlang-ai/DS-1000/blob/main/ds1000_data.zip).
|
| 16 |
+
|
| 17 |
+
The currently supported algorithm libraries are `Pandas`, `Numpy`, `Tensorflow`, `Scipy`, `Sklearn`, `Pytorch`, `Matplotlib`.
|
| 18 |
+
|
| 19 |
+
## Launching the Code Evaluation Service
|
| 20 |
+
|
| 21 |
+
1. Ensure you have installed Docker, please refer to [Docker installation document](https://docs.docker.com/engine/install/).
|
| 22 |
+
2. Pull the source code of the code evaluation service project and build the Docker image.
|
| 23 |
+
|
| 24 |
+
Choose the dockerfile corresponding to the dataset you need, and replace `humanevalx` or `ds1000` in the command below.
|
| 25 |
+
|
| 26 |
+
```shell
|
| 27 |
+
git clone https://github.com/open-compass/code-evaluator.git
|
| 28 |
+
docker build -t code-eval-{your-dataset}:latest -f docker/{your-dataset}/Dockerfile .
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
3. Create a container with the following commands:
|
| 32 |
+
|
| 33 |
+
```shell
|
| 34 |
+
# Log output format
|
| 35 |
+
docker run -it -p 5000:5000 code-eval-{your-dataset}:latest python server.py
|
| 36 |
+
|
| 37 |
+
# Run the program in the background
|
| 38 |
+
# docker run -itd -p 5000:5000 code-eval-{your-dataset}:latest python server.py
|
| 39 |
+
|
| 40 |
+
# Using different ports
|
| 41 |
+
# docker run -itd -p 5001:5001 code-eval-{your-dataset}:latest python server.py --port 5001
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
**Note:**
|
| 45 |
+
|
| 46 |
+
- If you encounter a timeout during the evaluation of Go, please use the following command when creating the container.
|
| 47 |
+
|
| 48 |
+
```shell
|
| 49 |
+
docker run -it -p 5000:5000 -e GO111MODULE=on -e GOPROXY=https://goproxy.io code-eval-{your-dataset}:latest python server.py
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
4. To ensure you have access to the service, use the following command to check the inference environment and evaluation service connection status. (If both inferences and code evaluations run on the same host, skip this step.)
|
| 53 |
+
|
| 54 |
+
```shell
|
| 55 |
+
ping your_service_ip_address
|
| 56 |
+
telnet your_service_ip_address your_service_port
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
## Local Code Evaluation
|
| 60 |
+
|
| 61 |
+
When the model inference and code evaluation services are running on the same host or within the same local area network, direct code reasoning and evaluation can be performed. **Note: DS1000 is currently not supported, please proceed with remote evaluation.**
|
| 62 |
+
|
| 63 |
+
### Configuration File
|
| 64 |
+
|
| 65 |
+
We provide [the configuration file](https://github.com/open-compass/opencompass/blob/main/configs/eval_codegeex2.py) of using `humanevalx` for evaluation on `codegeex2` as reference.
|
| 66 |
+
|
| 67 |
+
The dataset and related post-processing configurations files can be found at this [link](https://github.com/open-compass/opencompass/tree/main/configs/datasets/humanevalx) with attention paid to the `evaluator` field in the humanevalx_eval_cfg_dict.
|
| 68 |
+
|
| 69 |
+
```python
|
| 70 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 71 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 72 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 73 |
+
from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator
|
| 74 |
+
|
| 75 |
+
humanevalx_reader_cfg = dict(
|
| 76 |
+
input_columns=['prompt'], output_column='task_id', train_split='test')
|
| 77 |
+
|
| 78 |
+
humanevalx_infer_cfg = dict(
|
| 79 |
+
prompt_template=dict(
|
| 80 |
+
type=PromptTemplate,
|
| 81 |
+
template='{prompt}'),
|
| 82 |
+
retriever=dict(type=ZeroRetriever),
|
| 83 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024))
|
| 84 |
+
|
| 85 |
+
humanevalx_eval_cfg_dict = {
|
| 86 |
+
lang : dict(
|
| 87 |
+
evaluator=dict(
|
| 88 |
+
type=HumanevalXEvaluator,
|
| 89 |
+
language=lang,
|
| 90 |
+
ip_address="localhost", # replace to your code_eval_server ip_address, port
|
| 91 |
+
port=5000), # refer to https://github.com/open-compass/code-evaluator to launch a server
|
| 92 |
+
pred_role='BOT')
|
| 93 |
+
for lang in ['python', 'cpp', 'go', 'java', 'js'] # do not support rust now
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
humanevalx_datasets = [
|
| 97 |
+
dict(
|
| 98 |
+
type=HumanevalXDataset,
|
| 99 |
+
abbr=f'humanevalx-{lang}',
|
| 100 |
+
language=lang,
|
| 101 |
+
path='./data/humanevalx',
|
| 102 |
+
reader_cfg=humanevalx_reader_cfg,
|
| 103 |
+
infer_cfg=humanevalx_infer_cfg,
|
| 104 |
+
eval_cfg=humanevalx_eval_cfg_dict[lang])
|
| 105 |
+
for lang in ['python', 'cpp', 'go', 'java', 'js']
|
| 106 |
+
]
|
| 107 |
+
```
|
| 108 |
+
|
| 109 |
+
### Task Launch
|
| 110 |
+
|
| 111 |
+
Refer to the [Quick Start](../get_started.html)
|
| 112 |
+
|
| 113 |
+
## Remote Code Evaluation
|
| 114 |
+
|
| 115 |
+
Model inference and code evaluation services located in different machines which cannot be accessed directly require prior model inference before collecting the code evaluation results. The configuration file and inference process can be reused from the previous tutorial.
|
| 116 |
+
|
| 117 |
+
### Collect Inference Results(Only for Humanevalx)
|
| 118 |
+
|
| 119 |
+
In OpenCompass's tools folder, there is a script called `collect_code_preds.py` provided to process and collect the inference results after providing the task launch configuration file during startup along with specifying the working directory used corresponding to the task.
|
| 120 |
+
It is the same as the `-r` option in `run.py`. More details can be referred through the [documentation](https://opencompass.readthedocs.io/en/latest/get_started/quick_start.html#launching-evaluation).
|
| 121 |
+
|
| 122 |
+
```shell
|
| 123 |
+
python tools/collect_code_preds.py [config] [-r latest]
|
| 124 |
+
```
|
| 125 |
+
|
| 126 |
+
The collected results will be organized as following under the `-r` folder:
|
| 127 |
+
|
| 128 |
+
```
|
| 129 |
+
workdir/humanevalx
|
| 130 |
+
├── codegeex2-6b
|
| 131 |
+
│ ├── humanevalx_cpp.json
|
| 132 |
+
│ ├── humanevalx_go.json
|
| 133 |
+
│ ├── humanevalx_java.json
|
| 134 |
+
│ ├── humanevalx_js.json
|
| 135 |
+
│ └── humanevalx_python.json
|
| 136 |
+
├── CodeLlama-13b
|
| 137 |
+
│ ├── ...
|
| 138 |
+
├── CodeLlama-13b-Instruct
|
| 139 |
+
│ ├── ...
|
| 140 |
+
├── CodeLlama-13b-Python
|
| 141 |
+
│ ├── ...
|
| 142 |
+
├── ...
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
For DS1000, you just need to obtain the corresponding prediction file generated by `opencompass`.
|
| 146 |
+
|
| 147 |
+
### Code Evaluation
|
| 148 |
+
|
| 149 |
+
Make sure your code evaluation service is started, and use `curl` to request:
|
| 150 |
+
|
| 151 |
+
#### The following only supports Humanevalx
|
| 152 |
+
|
| 153 |
+
```shell
|
| 154 |
+
curl -X POST -F 'file=@{result_absolute_path}' -F 'dataset={dataset/language}' {your_service_ip_address}:{your_service_port}/evaluate
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
For example:
|
| 158 |
+
|
| 159 |
+
```shell
|
| 160 |
+
curl -X POST -F 'file=@./examples/humanevalx/python.json' -F 'dataset=humanevalx/python' localhost:5000/evaluate
|
| 161 |
+
```
|
| 162 |
+
|
| 163 |
+
Then we have:
|
| 164 |
+
|
| 165 |
+
```
|
| 166 |
+
"{\"pass@1\": 37.19512195121951%}"
|
| 167 |
+
```
|
| 168 |
+
|
| 169 |
+
Additionally, we offer an extra option named `with_prompt`(Defaults to `True`), since some models(like `WizardCoder`) generate complete codes without requiring the form of concatenating prompt and prediction. You may refer to the following commands for evaluation.
|
| 170 |
+
|
| 171 |
+
```shell
|
| 172 |
+
curl -X POST -F 'file=@./examples/humanevalx/python.json' -F 'dataset=humanevalx/python' -H 'with-prompt: False' localhost:5000/evaluate
|
| 173 |
+
```
|
| 174 |
+
|
| 175 |
+
#### The following only supports DS1000
|
| 176 |
+
|
| 177 |
+
Make sure the code evaluation service is started, then use `curl` to submit a request:
|
| 178 |
+
|
| 179 |
+
```shell
|
| 180 |
+
curl -X POST -F 'file=@./internlm-chat-7b-hf-v11/ds1000_Numpy.json' localhost:5000/evaluate
|
| 181 |
+
```
|
| 182 |
+
|
| 183 |
+
DS1000 supports additional debug parameters. Be aware that a large amount of log will be generated when it is turned on:
|
| 184 |
+
|
| 185 |
+
- `full`: Additional print out of the original prediction for each error sample, post-processing prediction, running program, and final error.
|
| 186 |
+
- `half`: Additional print out of the running program and final error for each error sample.
|
| 187 |
+
- `error`: Additional print out of the final error for each error sample.
|
| 188 |
+
|
| 189 |
+
```shell
|
| 190 |
+
curl -X POST -F 'file=@./internlm-chat-7b-hf-v11/ds1000_Numpy.json' -F 'debug=error' localhost:5000/evaluate
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
You can also modify the `num_workers` in the same way to control the degree of parallelism.
|
| 194 |
+
|
| 195 |
+
## Advanced Tutorial
|
| 196 |
+
|
| 197 |
+
Besides evaluating the supported HumanEvalX dataset, users might also need:
|
| 198 |
+
|
| 199 |
+
### Support New Dataset
|
| 200 |
+
|
| 201 |
+
Please refer to the [tutorial on supporting new datasets](./new_dataset.md).
|
| 202 |
+
|
| 203 |
+
### Modify Post-Processing
|
| 204 |
+
|
| 205 |
+
1. For local evaluation, follow the post-processing section in the tutorial on supporting new datasets to modify the post-processing method.
|
| 206 |
+
2. For remote evaluation, please modify the post-processing part in the tool's `collect_code_preds.py`.
|
| 207 |
+
3. Some parts of post-processing could also be modified in the code evaluation service, more information will be available in the next section.
|
| 208 |
+
|
| 209 |
+
### Debugging Code Evaluation Service
|
| 210 |
+
|
| 211 |
+
When supporting new datasets or modifying post-processors, it is possible that modifications need to be made to the original code evaluation service. Please make changes based on the following steps:
|
| 212 |
+
|
| 213 |
+
1. Remove the installation of the `code-evaluator` in `Dockerfile`, mount the `code-evaluator` when starting the container instead:
|
| 214 |
+
|
| 215 |
+
```shell
|
| 216 |
+
docker run -it -p 5000:5000 -v /local/path/of/code-evaluator:/workspace/code-evaluator code-eval:latest bash
|
| 217 |
+
```
|
| 218 |
+
|
| 219 |
+
2. Install and start the code evaluation service locally. At this point, any necessary modifications can be made to the local copy of the `code-evaluator`.
|
| 220 |
+
|
| 221 |
+
```shell
|
| 222 |
+
cd code-evaluator && pip install -r requirements.txt
|
| 223 |
+
python server.py
|
| 224 |
+
```
|
opencompass/docs/en/advanced_guides/contamination_eval.md
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Data Contamination Assessment
|
| 2 |
+
|
| 3 |
+
**Data Contamination** refers to the phenomenon where data intended for downstream testing tasks appear in the training data of large language models (LLMs), resulting in artificially inflated performance metrics in downstream tasks (such as summarization, natural language inference, text classification), which do not accurately reflect the model's true generalization capabilities.
|
| 4 |
+
|
| 5 |
+
Since the source of data contamination lies in the training data used by LLMs, the most direct method to detect data contamination is to collide test data with training data and then report the extent of overlap between the two. The classic GPT-3 [paper](https://arxiv.org/pdf/2005.14165.pdf) reported on this in Table C.1.
|
| 6 |
+
|
| 7 |
+
However, today's open-source community often only publishes model parameters, not training datasets. In such cases, how to determine the presence and extent of data contamination remains unsolved. OpenCompass offers two possible solutions.
|
| 8 |
+
|
| 9 |
+
## Contamination Data Annotation Based on Self-Built Co-Distribution Data
|
| 10 |
+
|
| 11 |
+
Referencing the method mentioned in Section 5.2 of [Skywork](https://arxiv.org/pdf/2310.19341.pdf), we directly used the dataset [mock_gsm8k_test](https://huggingface.co/datasets/Skywork/mock_gsm8k_test) uploaded to HuggingFace by Skywork.
|
| 12 |
+
|
| 13 |
+
In this method, the authors used GPT-4 to synthesize data similar to the original GSM8K style, and then calculated the perplexity on the GSM8K training set (train), GSM8K test set (test), and GSM8K reference set (ref). Since the GSM8K reference set was newly generated, the authors considered it as clean, not belonging to any training set of any model. They posited:
|
| 14 |
+
|
| 15 |
+
- If the test set's perplexity is significantly lower than the reference set's, the test set might have appeared in the model's training phase;
|
| 16 |
+
- If the training set's perplexity is significantly lower than the test set's, the training set might have been overfitted by the model.
|
| 17 |
+
|
| 18 |
+
The following configuration file can be referenced:
|
| 19 |
+
|
| 20 |
+
```python
|
| 21 |
+
from mmengine.config import read_base
|
| 22 |
+
|
| 23 |
+
with read_base():
|
| 24 |
+
from .datasets.gsm8k_contamination.gsm8k_contamination_ppl_ecdd22 import gsm8k_datasets # includes training, test, and reference sets
|
| 25 |
+
from .models.qwen.hf_qwen_7b import models as hf_qwen_7b_model # model under review
|
| 26 |
+
from .models.yi.hf_yi_6b import models as hf_yi_6b_model
|
| 27 |
+
|
| 28 |
+
datasets = [*gsm8k_datasets]
|
| 29 |
+
models = [*hf_qwen_7b_model, *hf_yi_6b_model]
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
An example output is as follows:
|
| 33 |
+
|
| 34 |
+
```text
|
| 35 |
+
dataset version metric mode internlm-7b-hf qwen-7b-hf yi-6b-hf chatglm3-6b-base-hf qwen-14b-hf baichuan2-13b-base-hf internlm-20b-hf aquila2-34b-hf ...
|
| 36 |
+
--------------- --------- ----------- ------- ---------------- ------------ ---------- --------------------- ------------- ----------------------- ----------------- ---------------- ...
|
| 37 |
+
gsm8k-train-ppl 0b8e46 average_ppl unknown 1.5 0.78 1.37 1.16 0.5 0.76 1.41 0.78 ...
|
| 38 |
+
gsm8k-test-ppl 0b8e46 average_ppl unknown 1.56 1.33 1.42 1.3 1.15 1.13 1.52 1.16 ...
|
| 39 |
+
gsm8k-ref-ppl f729ba average_ppl unknown 1.55 1.2 1.43 1.35 1.27 1.19 1.47 1.35 ...
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
Currently, this solution only supports the GSM8K dataset. We welcome the community to contribute more datasets.
|
| 43 |
+
|
| 44 |
+
Consider citing the following paper if you find it helpful:
|
| 45 |
+
|
| 46 |
+
```bibtex
|
| 47 |
+
@misc{2023opencompass,
|
| 48 |
+
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
|
| 49 |
+
author={OpenCompass Contributors},
|
| 50 |
+
howpublished = {\url{https://github.com/open-compass/opencompass}},
|
| 51 |
+
year={2023}
|
| 52 |
+
}
|
| 53 |
+
@misc{wei2023skywork,
|
| 54 |
+
title={Skywork: A More Open Bilingual Foundation Model},
|
| 55 |
+
author={Tianwen Wei and Liang Zhao and Lichang Zhang and Bo Zhu and Lijie Wang and Haihua Yang and Biye Li and Cheng Cheng and Weiwei Lü and Rui Hu and Chenxia Li and Liu Yang and Xilin Luo and Xuejie Wu and Lunan Liu and Wenjun Cheng and Peng Cheng and Jianhao Zhang and Xiaoyu Zhang and Lei Lin and Xiaokun Wang and Yutuan Ma and Chuanhai Dong and Yanqi Sun and Yifu Chen and Yongyi Peng and Xiaojuan Liang and Shuicheng Yan and Han Fang and Yahui Zhou},
|
| 56 |
+
year={2023},
|
| 57 |
+
eprint={2310.19341},
|
| 58 |
+
archivePrefix={arXiv},
|
| 59 |
+
primaryClass={cs.CL}
|
| 60 |
+
}
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
## Contamination Data Annotation Based on Classic Pre-trained Sets
|
| 64 |
+
|
| 65 |
+
Thanks to [Contamination_Detector](https://github.com/liyucheng09/Contamination_Detector) and @liyucheng09 for providing this method.
|
| 66 |
+
|
| 67 |
+
In this method, the authors search the test datasets (such as C-Eval, ARC, HellaSwag, etc.) using the Common Crawl database and Bing search engine, then mark each test sample as clean / question contaminated / both question and answer contaminated.
|
| 68 |
+
|
| 69 |
+
During testing, OpenCompass will report the accuracy or perplexity of C-Eval on subsets composed of these three labels. Generally, the accuracy ranges from low to high across the clean, question contaminated, and both question and answer contaminated subsets. The authors believe:
|
| 72 |
+
|
| 73 |
+
- If the performance of the three is relatively close, the contamination level of the model on that test set is light; otherwise, it is heavy.
|
| 74 |
+
|
| 75 |
+
The following configuration file can be referenced [link](https://github.com/open-compass/opencompass/blob/main/configs/eval_contamination.py):
|
| 76 |
+
|
| 77 |
+
```python
|
| 78 |
+
from mmengine.config import read_base
|
| 79 |
+
|
| 80 |
+
with read_base():
|
| 81 |
+
from .datasets.ceval.ceval_clean_ppl import ceval_datasets # ceval dataset with contamination tags
|
| 82 |
+
from .models.yi.hf_yi_6b import models as hf_yi_6b_model # model under review
|
| 83 |
+
from .models.qwen.hf_qwen_7b import models as hf_qwen_7b_model
|
| 84 |
+
from .summarizers.contamination import ceval_summarizer as summarizer # output formatting
|
| 85 |
+
|
| 86 |
+
datasets = [*ceval_datasets]
|
| 87 |
+
models = [*hf_yi_6b_model, *hf_qwen_7b_model]
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
An example output is as follows:
|
| 91 |
+
|
| 92 |
+
```text
|
| 93 |
+
dataset version mode yi-6b-hf - - qwen-7b-hf - - ...
|
| 94 |
+
---------------------------------------------- --------- ------ ---------------- ----------------------------- --------------------------------------- ---------------- ----------------------------- --------------------------------------- ...
|
| 95 |
+
- - - accuracy - clean accuracy - input contaminated accuracy - input-and-label contaminated accuracy - clean accuracy - input contaminated accuracy - input-and-label contaminated ...
|
| 96 |
+
...
|
| 97 |
+
ceval-humanities - ppl 74.42 75.00 82.14 67.44 50.00 70.54 ...
|
| 98 |
+
ceval-stem - ppl 53.70 57.14 85.61 47.41 52.38 67.63 ...
|
| 99 |
+
ceval-social-science - ppl 81.60 84.62 83.09 76.00 61.54 72.79 ...
|
| 100 |
+
ceval-other - ppl 72.31 73.91 75.00 58.46 39.13 61.88 ...
|
| 101 |
+
ceval-hard - ppl 44.35 37.50 70.00 41.13 25.00 30.00 ...
|
| 102 |
+
ceval - ppl 67.32 71.01 81.17 58.97 49.28 67.82 ...
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
Currently, this solution only supports the C-Eval, MMLU, HellaSwag and ARC. [Contamination_Detector](https://github.com/liyucheng09/Contamination_Detector) also includes CSQA and WinoGrande, but these have not yet been implemented in OpenCompass. We welcome the community to contribute more datasets.
|
| 106 |
+
|
| 107 |
+
Consider citing the following paper if you find it helpful:
|
| 108 |
+
|
| 109 |
+
```bibtex
|
| 110 |
+
@misc{2023opencompass,
|
| 111 |
+
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
|
| 112 |
+
author={OpenCompass Contributors},
|
| 113 |
+
howpublished = {\url{https://github.com/open-compass/opencompass}},
|
| 114 |
+
year={2023}
|
| 115 |
+
}
|
| 116 |
+
@article{Li2023AnOS,
|
| 117 |
+
title={An Open Source Data Contamination Report for Llama Series Models},
|
| 118 |
+
author={Yucheng Li},
|
| 119 |
+
journal={ArXiv},
|
| 120 |
+
year={2023},
|
| 121 |
+
volume={abs/2310.17589},
|
| 122 |
+
url={https://api.semanticscholar.org/CorpusID:264490711}
|
| 123 |
+
}
|
| 124 |
+
```
|
opencompass/docs/en/advanced_guides/custom_dataset.md
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Custom Dataset Tutorial
|
| 2 |
+
|
| 3 |
+
This tutorial is intended for temporary and informal use of datasets. If the dataset requires long-term use or has specific needs for custom reading/inference/evaluation, it is strongly recommended to implement it according to the methods described in [new_dataset.md](./new_dataset.md).
|
| 4 |
+
|
| 5 |
+
In this tutorial, we will introduce how to test a new dataset without implementing a config or modifying the OpenCompass source code. We support two types of tasks: multiple choice (`mcq`) and question & answer (`qa`). For `mcq`, both ppl and gen inferences are supported; for `qa`, gen inference is supported.
|
| 6 |
+
|
| 7 |
+
## Dataset Format
|
| 8 |
+
|
| 9 |
+
We support datasets in both `.jsonl` and `.csv` formats.
|
| 10 |
+
|
| 11 |
+
### Multiple Choice (`mcq`)
|
| 12 |
+
|
| 13 |
+
For `mcq` datasets, the default fields are as follows:
|
| 14 |
+
|
| 15 |
+
- `question`: The stem of the multiple-choice question.
|
| 16 |
+
- `A`, `B`, `C`, ...: Single uppercase letters representing the options, with no limit on the number. Defaults to parsing consecutive letters starting from `A` as options.
|
| 17 |
+
- `answer`: The correct answer to the multiple-choice question, which must be one of the options used above, such as `A`, `B`, `C`, etc.
|
| 18 |
+
|
| 19 |
+
Non-default fields will be read in but are not used by default. To use them, specify in the `.meta.json` file.
|
| 20 |
+
|
| 21 |
+
An example of the `.jsonl` format:
|
| 22 |
+
|
| 23 |
+
```jsonl
|
| 24 |
+
{"question": "165+833+650+615=", "A": "2258", "B": "2263", "C": "2281", "answer": "B"}
|
| 25 |
+
{"question": "368+959+918+653+978=", "A": "3876", "B": "3878", "C": "3880", "answer": "A"}
|
| 26 |
+
{"question": "776+208+589+882+571+996+515+726=", "A": "5213", "B": "5263", "C": "5383", "answer": "B"}
|
| 27 |
+
{"question": "803+862+815+100+409+758+262+169=", "A": "4098", "B": "4128", "C": "4178", "answer": "C"}
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
An example of the `.csv` format:
|
| 31 |
+
|
| 32 |
+
```csv
|
| 33 |
+
question,A,B,C,answer
|
| 34 |
+
127+545+588+620+556+199=,2632,2635,2645,B
|
| 35 |
+
735+603+102+335+605=,2376,2380,2410,B
|
| 36 |
+
506+346+920+451+910+142+659+850=,4766,4774,4784,C
|
| 37 |
+
504+811+870+445=,2615,2630,2750,B
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
### Question & Answer (`qa`)
|
| 41 |
+
|
| 42 |
+
For `qa` datasets, the default fields are as follows:
|
| 43 |
+
|
| 44 |
+
- `question`: The stem of the question & answer question.
|
| 45 |
+
- `answer`: The correct answer to the question & answer question. It can be missing, indicating the dataset has no correct answer.
|
| 46 |
+
|
| 47 |
+
Non-default fields will be read in but are not used by default. To use them, specify in the `.meta.json` file.
|
| 48 |
+
|
| 49 |
+
An example of the `.jsonl` format:
|
| 50 |
+
|
| 51 |
+
```jsonl
|
| 52 |
+
{"question": "752+361+181+933+235+986=", "answer": "3448"}
|
| 53 |
+
{"question": "712+165+223+711=", "answer": "1811"}
|
| 54 |
+
{"question": "921+975+888+539=", "answer": "3323"}
|
| 55 |
+
{"question": "752+321+388+643+568+982+468+397=", "answer": "4519"}
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
An example of the `.csv` format:
|
| 59 |
+
|
| 60 |
+
```csv
|
| 61 |
+
question,answer
|
| 62 |
+
123+147+874+850+915+163+291+604=,3967
|
| 63 |
+
149+646+241+898+822+386=,3142
|
| 64 |
+
332+424+582+962+735+798+653+214=,4700
|
| 65 |
+
649+215+412+495+220+738+989+452=,4170
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
## Command Line List
|
| 69 |
+
|
| 70 |
+
Custom datasets can be directly called for evaluation through the command line.
|
| 71 |
+
|
| 72 |
+
```bash
|
| 73 |
+
python run.py \
|
| 74 |
+
--models hf_llama2_7b \
|
| 75 |
+
--custom-dataset-path xxx/test_mcq.csv \
|
| 76 |
+
--custom-dataset-data-type mcq \
|
| 77 |
+
--custom-dataset-infer-method ppl
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
```bash
|
| 81 |
+
python run.py \
|
| 82 |
+
--models hf_llama2_7b \
|
| 83 |
+
--custom-dataset-path xxx/test_qa.jsonl \
|
| 84 |
+
--custom-dataset-data-type qa \
|
| 85 |
+
--custom-dataset-infer-method gen
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
In most cases, `--custom-dataset-data-type` and `--custom-dataset-infer-method` can be omitted. OpenCompass will set them based on the following logic:
|
| 91 |
+
|
| 92 |
+
- If options like `A`, `B`, `C`, etc., can be parsed from the dataset file, it is considered an `mcq` dataset; otherwise, it is considered a `qa` dataset.
|
| 93 |
+
- The default `infer_method` is `gen`.
|
| 94 |
+
|
| 95 |
+
## Configuration File
|
| 96 |
+
|
| 97 |
+
In the original configuration file, simply add a new item to the `datasets` variable. Custom datasets can be mixed with regular datasets.
|
| 98 |
+
|
| 99 |
+
```python
|
| 100 |
+
datasets = [
|
| 101 |
+
{"path": "xxx/test_mcq.csv", "data_type": "mcq", "infer_method": "ppl"},
|
| 102 |
+
{"path": "xxx/test_qa.jsonl", "data_type": "qa", "infer_method": "gen"},
|
| 103 |
+
]
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
## Supplemental Information for Dataset `.meta.json`
|
| 107 |
+
|
| 108 |
+
OpenCompass will try to parse the input dataset file by default, so in most cases, the `.meta.json` file is **not necessary**. However, if the dataset field names are not the default ones, or custom prompt words are required, it should be specified in the `.meta.json` file.
|
| 109 |
+
|
| 110 |
+
The file is placed in the same directory as the dataset, with the filename followed by `.meta.json`. An example file structure is as follows:
|
| 111 |
+
|
| 112 |
+
```tree
|
| 113 |
+
.
|
| 114 |
+
├── test_mcq.csv
|
| 115 |
+
├── test_mcq.csv.meta.json
|
| 116 |
+
├── test_qa.jsonl
|
| 117 |
+
└── test_qa.jsonl.meta.json
|
| 118 |
+
```
|
| 119 |
+
|
| 120 |
+
Possible fields in this file include:
|
| 121 |
+
|
| 122 |
+
- `abbr` (str): Abbreviation of the dataset, serving as its ID.
|
| 123 |
+
- `data_type` (str): Type of dataset, options are `mcq` and `qa`.
|
| 124 |
+
- `infer_method` (str): Inference method, options are `ppl` and `gen`.
|
| 125 |
+
- `human_prompt` (str): User prompt template for generating prompts. Variables in the template are enclosed in `{}`, like `{question}`, `{opt1}`, etc. If `template` exists, this field will be ignored.
|
| 126 |
+
- `bot_prompt` (str): Bot prompt template for generating prompts. Variables in the template are enclosed in `{}`, like `{answer}`, etc. If `template` exists, this field will be ignored.
|
| 127 |
+
- `template` (str or dict): Question template for generating prompts. Variables in the template are enclosed in `{}`, like `{question}`, `{opt1}`, etc. The relevant syntax is in [here](../prompt/prompt_template.md) regarding `infer_cfg['prompt_template']['template']`.
|
| 128 |
+
- `input_columns` (list): List of input fields for reading data.
|
| 129 |
+
- `output_column` (str): Output field for reading data.
|
| 130 |
+
- `options` (list): List of options for reading data, valid only when `data_type` is `mcq`.
|
| 131 |
+
|
| 132 |
+
For example:
|
| 133 |
+
|
| 134 |
+
```json
|
| 135 |
+
{
|
| 136 |
+
"human_prompt": "Question: 127 + 545 + 588 + 620 + 556 + 199 =\nA. 2632\nB. 2635\nC. 2645\nAnswer: Let's think step by step, 127 + 545 + 588 + 620 + 556 + 199 = 672 + 588 + 620 + 556 + 199 = 1260 + 620 + 556 + 199 = 1880 + 556 + 199 = 2436 + 199 = 2635. So the answer is B.\nQuestion: {question}\nA. {A}\nB. {B}\nC. {C}\nAnswer: ",
|
| 137 |
+
"bot_prompt": "{answer}"
|
| 138 |
+
}
|
| 139 |
+
```
|
| 140 |
+
|
| 141 |
+
or
|
| 142 |
+
|
| 143 |
+
```json
|
| 144 |
+
{
|
| 145 |
+
"template": "Question: {my_question}\nX. {X}\nY. {Y}\nZ. {Z}\nW. {W}\nAnswer:",
|
| 146 |
+
"input_columns": ["my_question", "X", "Y", "Z", "W"],
|
| 147 |
+
    "output_column": "my_answer"
|
| 148 |
+
}
|
| 149 |
+
```
|
opencompass/docs/en/advanced_guides/evaluation_lightllm.md
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Evaluation with Lightllm
|
| 2 |
+
|
| 3 |
+
We now support the evaluation of large language models using [Lightllm](https://github.com/ModelTC/lightllm) for inference. Developed by SenseTime, LightLLM is a Python-based LLM (Large Language Model) inference and serving framework, notable for its lightweight design, easy scalability, and high-speed performance. Lightllm provides support for various large language models, allowing users to perform model inference through Lightllm, locally deploying it as a service. During the evaluation process, OpenCompass feeds data to Lightllm through an API and processes the response. OpenCompass has been adapted for compatibility with Lightllm, and this tutorial will guide you on using OpenCompass to evaluate models with Lightllm as the inference backend.
|
| 4 |
+
|
| 5 |
+
## Setup
|
| 6 |
+
|
| 7 |
+
### Install OpenCompass
|
| 8 |
+
|
| 9 |
+
Please follow the [instructions](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) to install the OpenCompass and prepare the evaluation datasets.
|
| 10 |
+
|
| 11 |
+
### Install Lightllm
|
| 12 |
+
|
| 13 |
+
Please follow the [Lightllm homepage](https://github.com/ModelTC/lightllm) to install the Lightllm. Pay attention to aligning the versions of relevant dependencies, especially the version of the Transformers.
|
| 14 |
+
|
| 15 |
+
## Evaluation
|
| 16 |
+
|
| 17 |
+
We use the evaluation of Humaneval with the llama2-7B model as an example.
|
| 18 |
+
|
| 19 |
+
### Step-1: Deploy the model locally as a service using Lightllm.
|
| 20 |
+
|
| 21 |
+
```shell
|
| 22 |
+
python -m lightllm.server.api_server --model_dir /path/llama2-7B \
|
| 23 |
+
--host 0.0.0.0 \
|
| 24 |
+
--port 1030 \
|
| 25 |
+
--nccl_port 2066 \
|
| 26 |
+
--max_req_input_len 4096 \
|
| 27 |
+
--max_req_total_len 6144 \
|
| 28 |
+
--tp 1 \
|
| 29 |
+
--trust_remote_code \
|
| 30 |
+
--max_total_token_num 120000
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
**Note:** `tp` can be configured to enable TensorParallel inference on several GPUs, suitable for the inference of very large models.
|
| 34 |
+
|
| 35 |
+
**Note:** The `max_total_token_num` in the above command will affect the throughput performance during testing. It can be configured according to the documentation on the [Lightllm homepage](https://github.com/ModelTC/lightllm). As long as it does not run out of memory, it is often better to set it as high as possible.
|
| 36 |
+
|
| 37 |
+
**Note:** If you want to start multiple LightLLM services on the same machine, you need to reconfigure the above `port` and `nccl_port`.
|
| 38 |
+
|
| 39 |
+
You can use the following Python script to quickly test whether the current service has been successfully started.
|
| 40 |
+
|
| 41 |
+
```python
|
| 42 |
+
import time
|
| 43 |
+
import requests
|
| 44 |
+
import json
|
| 45 |
+
|
| 46 |
+
url = 'http://localhost:1030/generate'
|
| 47 |
+
headers = {'Content-Type': 'application/json'}
|
| 48 |
+
data = {
|
| 49 |
+
'inputs': 'What is AI?',
|
| 50 |
+
"parameters": {
|
| 51 |
+
'do_sample': False,
|
| 52 |
+
'ignore_eos': False,
|
| 53 |
+
'max_new_tokens': 1024,
|
| 54 |
+
}
|
| 55 |
+
}
|
| 56 |
+
response = requests.post(url, headers=headers, data=json.dumps(data))
|
| 57 |
+
if response.status_code == 200:
|
| 58 |
+
print(response.json())
|
| 59 |
+
else:
|
| 60 |
+
print('Error:', response.status_code, response.text)
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
### Step-2: Evaluate the above model using OpenCompass.
|
| 64 |
+
|
| 65 |
+
```shell
|
| 66 |
+
python run.py configs/eval_lightllm.py
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
You are expected to get the evaluation results after the inference and evaluation.
|
| 70 |
+
|
| 71 |
+
**Note:** In `eval_lightllm.py`, please align the configured URL with the service address from the previous step.
|
opencompass/docs/en/advanced_guides/evaluation_lmdeploy.md
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Evaluation with LMDeploy
|
| 2 |
+
|
| 3 |
+
We now support evaluation of models accelerated by the [LMDeploy](https://github.com/InternLM/lmdeploy). LMDeploy is a toolkit designed for compressing, deploying, and serving LLM. It has a remarkable inference performance. We now illustrate how to evaluate a model with the support of LMDeploy in OpenCompass.
|
| 4 |
+
|
| 5 |
+
## Setup
|
| 6 |
+
|
| 7 |
+
### Install OpenCompass
|
| 8 |
+
|
| 9 |
+
Please follow the [instructions](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) to install the OpenCompass and prepare the evaluation datasets.
|
| 10 |
+
|
| 11 |
+
### Install LMDeploy
|
| 12 |
+
|
| 13 |
+
Install lmdeploy via pip (python 3.8+)
|
| 14 |
+
|
| 15 |
+
```shell
|
| 16 |
+
pip install lmdeploy
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
The default prebuilt package is compiled on CUDA 12. However, if CUDA 11+ is required, you can install lmdeploy by:
|
| 20 |
+
|
| 21 |
+
```shell
|
| 22 |
+
export LMDEPLOY_VERSION=0.6.0
|
| 23 |
+
export PYTHON_VERSION=310
|
| 24 |
+
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
## Evaluation
|
| 28 |
+
|
| 29 |
+
When evaluating a model, it is necessary to prepare an evaluation configuration that specifies information such as the evaluation dataset, the model, and inference parameters.
|
| 30 |
+
|
| 31 |
+
Taking [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) as an example, the evaluation config is as follows:
|
| 32 |
+
|
| 33 |
+
```python
|
| 34 |
+
# configure the dataset
|
| 35 |
+
from mmengine.config import read_base
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
with read_base():
|
| 39 |
+
# choose a list of datasets
|
| 40 |
+
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
|
| 41 |
+
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
|
| 42 |
+
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
|
| 43 |
+
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
|
| 44 |
+
gsm8k_datasets
|
| 45 |
+
# and output the results in a chosen format
|
| 46 |
+
from .summarizers.medium import summarizer
|
| 47 |
+
|
| 48 |
+
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
| 49 |
+
|
| 50 |
+
# configure lmdeploy
|
| 51 |
+
from opencompass.models import TurboMindModelwithChatTemplate
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# configure the model
|
| 56 |
+
models = [
|
| 57 |
+
dict(
|
| 58 |
+
type=TurboMindModelwithChatTemplate,
|
| 59 |
+
abbr=f'internlm2-chat-7b-lmdeploy',
|
| 60 |
+
# model path, which can be the address of a model repository on the Hugging Face Hub or a local path
|
| 61 |
+
path='internlm/internlm2-chat-7b',
|
| 62 |
+
# inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'.
|
| 63 |
+
# If the model is not supported by 'turbomind', it will fallback to
|
| 64 |
+
# 'pytorch'
|
| 65 |
+
backend='turbomind',
|
| 66 |
+
# For the detailed engine config and generation config, please refer to
|
| 67 |
+
# https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py
|
| 68 |
+
engine_config=dict(tp=1),
|
| 69 |
+
gen_config=dict(do_sample=False),
|
| 70 |
+
# the max size of the context window
|
| 71 |
+
max_seq_len=7168,
|
| 72 |
+
# the max number of new tokens
|
| 73 |
+
max_out_len=1024,
|
| 74 |
+
# the max number of prompts that LMDeploy receives
|
| 75 |
+
# in `generate` function
|
| 76 |
+
batch_size=5000,
|
| 77 |
+
run_cfg=dict(num_gpus=1),
|
| 78 |
+
)
|
| 79 |
+
]
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
Place the aforementioned configuration in a file, such as "configs/eval_internlm2_lmdeploy.py". Then, in the home folder of OpenCompass, start evaluation by the following command:
|
| 83 |
+
|
| 84 |
+
```shell
|
| 85 |
+
python run.py configs/eval_internlm2_lmdeploy.py -w outputs
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
You are expected to get the evaluation results after the inference and evaluation.
|
opencompass/docs/en/advanced_guides/longeval.md
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Long Context Evaluation Guidance
|
| 2 |
+
|
| 3 |
+
## Introduction
|
| 4 |
+
|
| 5 |
+
Although large-scale language models (LLMs) such as GPT-4 have demonstrated significant advantages in handling natural language tasks, most current open-source models can only handle texts with a length of a few thousand tokens, which limits their ability to process long contexts such as reading books and writing text summaries. To explore the performance of models in dealing with long contexts, we use the [L-Eval](https://github.com/OpenLMLab/LEval) and [LongBench](https://github.com/THUDM/LongBench) datasets to test the model's ability to handle long contexts.
|
| 6 |
+
|
| 7 |
+
## Existing Algorithms and Models
|
| 8 |
+
|
| 9 |
+
When dealing with long context inputs, the two main challenges faced by large models are the inference time cost and catastrophic forgetting. Recently, a large amount of research has been devoted to extending the model length, focusing on three improvement directions:
|
| 10 |
+
|
| 11 |
+
- Attention mechanisms. The ultimate goal of these methods is to reduce the computation cost of query-key pairs, but they may affect the performance of downstream tasks.
|
| 12 |
+
- Input methods. Some studies divide long context inputs into chunks or retrieve pre-existing text segments to enhance the model's ability to handle long contexts, but these methods are only effective for some tasks and are difficult to adapt to multiple downstream tasks.
|
| 13 |
+
- Position encoding. This research includes RoPE, ALiBi, Position Interpolation etc., which have shown good results in length extrapolation. These methods have been used to train long context models such as ChatGLM2-6B-32k and LongChat-32k.
|
| 14 |
+
|
| 15 |
+
First, we introduce some popular position encoding algorithms.
|
| 16 |
+
|
| 17 |
+
### RoPE
|
| 18 |
+
|
| 19 |
+
RoPE is a type of positional embedding that injects the information of position in Transformer. It encodes the absolute position with a rotation matrix and meanwhile incorporates the explicit relative position dependency in self-attention formulation. A graphic illustration of RoPE is shown below.
|
| 20 |
+
|
| 21 |
+
<div align="center">
|
| 22 |
+
<img src=https://github.com/open-compass/opencompass/assets/75252858/08c57958-0dcb-40d7-b91b-33f20ca2d89f>
|
| 23 |
+
</div>
|
| 24 |
+
|
| 25 |
+
RoPE comes with valuable properties such as the flexibility of being expanded to any sequence length, decaying inter-token dependency with increasing relative distance, and the capability of equipping the linear self-attention with relative position encoding.
|
| 26 |
+
|
| 27 |
+
RoPE is adopted in many LLMs including LLaMA, LLaMA 2 and Vicuna-7b-v1.5-16k.
|
| 28 |
+
|
| 29 |
+
### ALiBi
|
| 30 |
+
|
| 31 |
+
Though RoPE and other alternatives to the original sinusoidal position method (like the T5 bias) have improved extrapolation, they are considerably slower than the sinusoidal approach and use extra memory and parameters. Therefore, Attention with Linear Biases (ALiBi) is introduced to facilitate efficient extrapolation.
|
| 32 |
+
|
| 33 |
+
For an input subsequence of length L, the attention sublayer computes the attention scores for the ith query
|
| 34 |
+
|
| 35 |
+
```{math}
|
| 36 |
+
q_{i} \in R^{1 \times d}, (1 \leq i \leq L)
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
in each head, given the first i keys
|
| 40 |
+
|
| 41 |
+
```{math}
|
| 42 |
+
K \in R^{i \times d}
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
where d is the head dimension.
|
| 46 |
+
|
| 47 |
+
```{math}
|
| 48 |
+
softmax(q_{i}K^{T})
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
ALiBi negatively biases attention scores with a linearly decreasing penalty proportional to the distance between the relevant key and query. The only modification it applies is after the query-key dot product, where it adds a static, non-learned bias.
|
| 52 |
+
|
| 53 |
+
```{math}
|
| 54 |
+
softmax(q_{i}K^{T}+m\cdot[-(i-1),...,-2,-1,0])
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
where scalar m is a head-specific slope fixed before training.
|
| 58 |
+
|
| 59 |
+
ALiBi eliminates position embeddings and it is as fast as the sinusoidal approach. It is used in LLMs including mpt-7b-storywriter, which is prepared to handle extremely long inputs.
|
| 60 |
+
|
| 61 |
+
### Position Interpolation (PI)
|
| 62 |
+
|
| 63 |
+
Many existing pre-trained LLMs including LLaMA use positional encodings that have weak extrapolation properties (e.g. RoPE). Position Interpolation is proposed and it can easily enable very long context windows while preserving model quality relatively well for the tasks within its original context window size.
|
| 64 |
+
|
| 65 |
+
The key idea of Position Interpolation is to directly down-scale the position indices so that the maximum position index matches the previous context window limit in the pre-training stage. In other words, to accommodate more input tokens, the algorithm interpolates position encodings at neighboring integer positions, utilizing the fact that position encodings can be applied on non-integer positions, as opposed to extrapolating outside the trained positions, which may lead to catastrophic values. The algorithm requires only a very short period of fine-tuning for the model to fully adapt to greatly extended context windows.
|
| 66 |
+
|
| 67 |
+
An illustration of Position Interpolation method is shown below. Lower left illustrates Position Interpolation where it downscales the position indices (blue and green dots) themselves from \[0, 4096\] to \[0, 2048\] to force them to reside in the pretrained range.
|
| 68 |
+
|
| 69 |
+
<div align="center">
|
| 70 |
+
<img src=https://github.com/open-compass/opencompass/assets/75252858/406454ba-a811-4c66-abbe-3a5528947257>
|
| 71 |
+
</div>
|
| 72 |
+
|
| 73 |
+
Position Interpolation empowers ChatGLM2-6B-32k, a model based on ChatGLM2-6B, to deal with a 32k context window size.
|
| 74 |
+
|
| 75 |
+
Next, we introduce some long context language models we evaluate.
|
| 76 |
+
|
| 77 |
+
### XGen-7B-8k
|
| 78 |
+
|
| 79 |
+
XGen-7B-8k is trained with standard dense attention on up to 8k sequence length for up to 1.5T tokens. To mitigate slow training, XGen-7B-8k introduces training in stages with increasing sequence length. First, 800B tokens with sequence length of 2k tokens are observed, then 400B tokens with 4k, finally, 300B tokens with 8k length.
|
| 80 |
+
|
| 81 |
+
### Vicuna-7b-v1.5-16k
|
| 82 |
+
|
| 83 |
+
Vicuna-7b-v1.5-16k is fine-tuned from LLaMA 2 with supervised instruction fine-tuning and linear RoPE scaling. The training data is around 125K conversations collected from ShareGPT, a website where users can share their ChatGPT conversation. These conversations are packed into sequences that contain 16k tokens each.
|
| 84 |
+
|
| 85 |
+
### LongChat-7b-v1.5-32k
|
| 86 |
+
|
| 87 |
+
LongChat-7b-v1.5-32k is fine-tuned from LLaMA 2 models, which were originally pretrained with 4k context length. The training recipe can be conceptually described in two steps. The first step is condensing RoPE. Since the LLaMA model has not observed scenarios where position_ids > 4096 during the pre-training phase, LongChat condenses position_ids > 4096 to be within 0 to 4096. The second step is fine-tuning LongChat model on curated conversation data. In this step, the data is cleaned using FastChat data pipeline and truncated to the maximum length of model.
|
| 88 |
+
|
| 89 |
+
### ChatGLM2-6B-32k
|
| 90 |
+
|
| 91 |
+
The ChatGLM2-6B-32k further strengthens the ability to understand long texts based on the ChatGLM2-6B. Based on the method of Positional Interpolation, and trained with a 32K context length during the dialogue alignment, ChatGLM2-6B-32k can better handle up to 32K context length.
|
| 92 |
+
|
| 93 |
+
## [L-Eval](https://github.com/OpenLMLab/LEval)
|
| 94 |
+
|
| 95 |
+
L-Eval is a long context dataset built by OpenLMLab, consisting of 18 subtasks, including texts from various fields such as law, economy, and technology. The dataset consists of a total of 411 documents, over 2000 test cases, with an average document length of 7217 words. The subtasks in this dataset are divided into close-ended and open-ended categories, with 5 close-ended tasks evaluated using the exact match criterion and 13 open-ended tasks evaluated using Rouge scores.
|
| 96 |
+
|
| 97 |
+
## [LongBench](https://github.com/THUDM/LongBench)
|
| 98 |
+
|
| 99 |
+
LongBench is a long context dataset built by THUDM, consisting of 21 subtasks with a total of 4750 test cases. This dataset is the first long context dataset that includes both English and Chinese texts, with an average English text length of 6711 words and an average Chinese text length of 13386 characters. The 21 subtasks are divided into 6 types, providing a more comprehensive evaluation of the model's capabilities in various aspects.
|
| 100 |
+
|
| 101 |
+
<div align="center">
|
| 102 |
+
<img src=https://github.com/open-compass/opencompass/assets/75252858/4555e937-c519-4e9c-ad8d-7370430d466a>
|
| 103 |
+
</div>
|
| 104 |
+
|
| 105 |
+
## Evaluation Method
|
| 106 |
+
|
| 107 |
+
Due to the different maximum input lengths accepted by different models, in order to compare these large models more fairly, when the input length exceeds the maximum input limit of the model, we will trim the middle part of the input text to avoid missing prompt words.
|
| 108 |
+
|
| 109 |
+
## Long Context Ability Ranking
|
| 110 |
+
|
| 111 |
+
In the LongBench and L-Eval ability rankings, we select the average ranking **(The lower the better)** of each model in the subtask as the standard. It can be seen that GPT-4 and GPT-3.5-turbo-16k still occupy a leading position in long context tasks, while models like ChatGLM2-6B-32k also show significant improvement in long context ability after position interpolation based on ChatGLM2-6B.
|
| 112 |
+
|
| 113 |
+
<div align="center">
|
| 114 |
+
<img src=https://github.com/open-compass/opencompass/assets/75252858/29b5ad12-d9a3-4255-be0a-f770923fe514>
|
| 115 |
+
<img src=https://github.com/open-compass/opencompass/assets/75252858/680b4cda-c2b1-45d1-8c33-196dee1a38f3>
|
| 116 |
+
</div>
|
| 117 |
+
|
| 118 |
+
The original scores are shown below.
|
| 119 |
+
|
| 120 |
+
| L-Eval | GPT-4 | GPT-3.5-turbo-16k | chatglm2-6b-32k | vicuna-7b-v1.5-16k | xgen-7b-8k | internlm-chat-7b-8k | longchat-7b-v1.5-32k | chatglm2-6b |
|
| 121 |
+
| ----------------- | ----- | ----------------- | --------------- | ------------------ | ---------- | ------------------- | -------------------- | ----------- |
|
| 122 |
+
| coursera | 61.05 | 50 | 45.35 | 26.74 | 33.72 | 40.12 | 27.91 | 38.95 |
|
| 123 |
+
| gsm100 | 92 | 78 | 27 | 11 | 8 | 19 | 5 | 8 |
|
| 124 |
+
| quality | 81.19 | 62.87 | 44.55 | 11.39 | 33.66 | 45.54 | 29.7 | 41.09 |
|
| 125 |
+
| tpo | 72.93 | 74.72 | 56.51 | 17.47 | 44.61 | 60.59 | 17.1 | 56.51 |
|
| 126 |
+
| topic_retrieval | 100 | 79.33 | 44.67 | 24.67 | 1.33 | 0 | 25.33 | 1.33 |
|
| 127 |
+
| | | | | | | | | |
|
| 128 |
+
| financialqa | 53.49 | 50.32 | 35.41 | 44.59 | 39.28 | 25.09 | 34.07 | 17.82 |
|
| 129 |
+
| gov_report | 50.84 | 50.48 | 42.97 | 48.17 | 38.52 | 31.29 | 36.52 | 41.88 |
|
| 130 |
+
| legal_contract_qa | 31.23 | 27.97 | 34.21 | 24.25 | 21.36 | 19.28 | 13.32 | 17.59 |
|
| 131 |
+
| meeting_summ | 31.44 | 33.54 | 29.13 | 28.52 | 27.96 | 17.56 | 22.32 | 15.98 |
|
| 132 |
+
| multidocqa | 37.81 | 35.84 | 28.6 | 26.88 | 24.41 | 22.43 | 21.85 | 19.66 |
|
| 133 |
+
| narrativeqa | 25.87 | 25.73 | 18.24 | 20.58 | 16.87 | 13.81 | 16.87 | 1.16 |
|
| 134 |
+
| nq | 67.36 | 66.91 | 41.06 | 36.44 | 29.43 | 16.42 | 35.02 | 0.92 |
|
| 135 |
+
| news_summ | 34.52 | 40.41 | 32.72 | 33.98 | 26.87 | 22.48 | 30.33 | 29.51 |
|
| 136 |
+
| paper_assistant | 42.26 | 41.76 | 34.59 | 35.83 | 25.39 | 28.25 | 30.42 | 30.43 |
|
| 137 |
+
| patent_summ | 48.61 | 50.62 | 46.04 | 48.87 | 46.53 | 30.3 | 41.6 | 41.25 |
|
| 138 |
+
| review_summ | 31.98 | 33.37 | 21.88 | 29.21 | 26.85 | 16.61 | 20.02 | 19.68 |
|
| 139 |
+
| scientificqa | 49.76 | 48.32 | 31.27 | 31 | 27.43 | 33.01 | 20.98 | 13.61 |
|
| 140 |
+
| tvshow_summ | 34.84 | 31.36 | 23.97 | 27.88 | 26.6 | 14.55 | 25.09 | 19.45 |
|
| 141 |
+
|
| 142 |
+
| LongBench | GPT-4 | GPT-3.5-turbo-16k | chatglm2-6b-32k | longchat-7b-v1.5-32k | vicuna-7b-v1.5-16k | internlm-chat-7b-8k | chatglm2-6b | xgen-7b-8k |
|
| 143 |
+
| ------------------- | ----- | ----------------- | --------------- | -------------------- | ------------------ | ------------------- | ----------- | ---------- |
|
| 144 |
+
| NarrativeQA | 31.2 | 25.79 | 19.27 | 19.19 | 23.65 | 12.24 | 13.09 | 18.85 |
|
| 145 |
+
| Qasper | 42.77 | 43.4 | 33.93 | 30.36 | 31.45 | 24.81 | 22.52 | 20.18 |
|
| 146 |
+
| MultiFieldQA-en | 55.1 | 54.35 | 45.58 | 44.6 | 43.38 | 25.41 | 38.09 | 37 |
|
| 147 |
+
| MultiFieldQA-zh | 64.4 | 61.92 | 52.94 | 32.35 | 44.65 | 36.13 | 37.67 | 14.7 |
|
| 148 |
+
| | | | | | | | | |
|
| 149 |
+
| HotpotQA | 59.85 | 52.49 | 46.41 | 34.43 | 34.17 | 27.42 | 27.35 | 28.78 |
|
| 150 |
+
| 2WikiMQA | 67.52 | 41.7 | 33.63 | 23.06 | 20.45 | 26.24 | 22.83 | 20.13 |
|
| 151 |
+
| Musique | 37.53 | 27.5 | 21.57 | 12.42 | 13.92 | 9.75 | 7.26 | 11.34 |
|
| 152 |
+
| DuReader (zh) | 38.65 | 29.37 | 38.53 | 20.25 | 20.42 | 11.11 | 17.18 | 8.57 |
|
| 153 |
+
| | | | | | | | | |
|
| 154 |
+
| GovReport | 32.09 | 29.92 | 32.47 | 29.83 | 29.27 | 18.38 | 22.86 | 23.37 |
|
| 155 |
+
| QMSum | 24.37 | 23.67 | 23.19 | 22.71 | 23.37 | 18.45 | 21.23 | 21.12 |
|
| 156 |
+
| Multi_news | 28.52 | 27.05 | 25.12 | 26.1 | 27.83 | 24.52 | 24.7 | 23.69 |
|
| 157 |
+
| VCSUM (zh) | 15.54 | 16.88 | 15.95 | 13.46 | 15.76 | 12.91 | 14.07 | 0.98 |
|
| 158 |
+
| | | | | | | | | |
|
| 159 |
+
| TREC | 78.5 | 73.5 | 30.96 | 29.23 | 32.06 | 39 | 24.46 | 29.31 |
|
| 160 |
+
| TriviaQA | 92.19 | 92.75 | 80.64 | 64.19 | 46.53 | 79.55 | 64.19 | 69.58 |
|
| 161 |
+
| SAMSum | 46.32 | 43.16 | 29.49 | 25.23 | 25.23 | 43.05 | 20.22 | 16.05 |
|
| 162 |
+
| LSHT (zh) | 41.5 | 34.5 | 22.75 | 20 | 24.75 | 20.5 | 16 | 18.67 |
|
| 163 |
+
| | | | | | | | | |
|
| 164 |
+
| Passage Count | 8.5 | 3 | 3 | 1 | 3 | 1.76 | 3 | 1 |
|
| 165 |
+
| PassageRetrieval-en | 75 | 73 | 57.5 | 20.5 | 16.5 | 7 | 5.5 | 12 |
|
| 166 |
+
| PassageRetrieval-zh | 96 | 82.5 | 58 | 15 | 21 | 2.29 | 5 | 3.75 |
|
| 167 |
+
| | | | | | | | | |
|
| 168 |
+
| LCC | 59.25 | 53.49 | 53.3 | 51.46 | 49.3 | 49.32 | 46.59 | 44.1 |
|
| 169 |
+
| RepoBench-P | 55.42 | 55.95 | 46.66 | 52.18 | 41.49 | 35.86 | 41.97 | 41.83 |
|
opencompass/docs/en/advanced_guides/needleinahaystack_eval.md
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Needle In A Haystack Experimental Evaluation
|
| 2 |
+
|
| 3 |
+
## Introduction to the Needle In A Haystack Test
|
| 4 |
+
|
| 5 |
+
The Needle In A Haystack test (inspired by [NeedleInAHaystack](https://github.com/gkamradt/LLMTest_NeedleInAHaystack/blob/main/LLMNeedleHaystackTester.py)) is an evaluation method that randomly inserts key information into long texts to form prompts for large language models (LLMs). The test aims to detect whether large models can extract such key information from extensive texts, thereby assessing the models' capabilities in processing and understanding long documents.
|
| 6 |
+
|
| 7 |
+
## Task Overview
|
| 8 |
+
|
| 9 |
+
Within the `NeedleBench` framework of `OpenCompass`, we have designed a series of increasingly challenging test scenarios to comprehensively evaluate the models' abilities in long text information extraction and reasoning. For a complete introduction, refer to our [technical report](https://arxiv.org/abs/2407.11963):
|
| 10 |
+
|
| 11 |
+
- **Single-Needle Retrieval Task (S-RT)**: Assesses an LLM's ability to extract a single key piece of information from a long text, testing its precision in recalling specific details within broad narratives. This corresponds to the **original Needle In A Haystack test** setup.
|
| 12 |
+
|
| 13 |
+
- **Multi-Needle Retrieval Task (M-RT)**: Explores an LLM's capability to retrieve multiple related pieces of information from long texts, simulating real-world scenarios of complex queries on comprehensive documents.
|
| 14 |
+
|
| 15 |
+
- **Multi-Needle Reasoning Task (M-RS)**: Evaluates an LLM's long-text abilities by extracting and utilizing multiple key pieces of information, requiring the model to have a comprehensive understanding of each key information fragment.
|
| 16 |
+
|
| 17 |
+
- **Ancestral Trace Challenge (ATC)**: Uses the "relational needle" to test an LLM's ability to handle multi-layer logical challenges in real long texts. In the ATC task, a series of logical reasoning questions are used to test the model's memory and analytical skills for every detail in the text. For this task, we remove the irrelevant text (Haystack) setting, designing all texts as critical information, requiring the LLM to use all the content and reasoning in the text accurately to answer the questions.
|
| 18 |
+
|
| 19 |
+
### Evaluation Steps
|
| 20 |
+
|
| 21 |
+
> Note: In the latest code, OpenCompass has been set to automatically load the dataset from [Huggingface API](https://huggingface.co/datasets/opencompass/NeedleBench), so you can **skip directly** the following steps of manually downloading and placing the dataset.
|
| 22 |
+
|
| 23 |
+
1. Download the dataset from [here](https://github.com/open-compass/opencompass/files/14741330/needlebench.zip).
|
| 24 |
+
|
| 25 |
+
2. Place the downloaded files in the `opencompass/data/needlebench/` directory. The expected file structure in the `needlebench` directory is shown below:
|
| 26 |
+
|
| 27 |
+
```
|
| 28 |
+
opencompass/
|
| 29 |
+
├── configs
|
| 30 |
+
├── docs
|
| 31 |
+
├── data
|
| 32 |
+
│ └── needlebench
|
| 33 |
+
│ ├── multi_needle_reasoning_en.json
|
| 34 |
+
│ ├── multi_needle_reasoning_zh.json
|
| 35 |
+
│ ├── names.json
|
| 36 |
+
│ ├── needles.jsonl
|
| 37 |
+
│ ├── PaulGrahamEssays.jsonl
|
| 38 |
+
│ ├── zh_finance.jsonl
|
| 39 |
+
│ ├── zh_game.jsonl
|
| 40 |
+
│ ├── zh_government.jsonl
|
| 41 |
+
│ ├── zh_movie.jsonl
|
| 42 |
+
│ ├── zh_tech.jsonl
|
| 43 |
+
│ ├── zh_general.jsonl
|
| 44 |
+
├── LICENSE
|
| 45 |
+
├── opencompass
|
| 46 |
+
├── outputs
|
| 47 |
+
├── run.py
|
| 48 |
+
├── more...
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
### `OpenCompass` Environment Setup
|
| 52 |
+
|
| 53 |
+
```bash
|
| 54 |
+
conda create --name opencompass python=3.10 pytorch torchvision pytorch-cuda -c nvidia -c pytorch -y
|
| 55 |
+
conda activate opencompass
|
| 56 |
+
git clone https://github.com/open-compass/opencompass opencompass
|
| 57 |
+
cd opencompass
|
| 58 |
+
pip install -e .
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
### Configuring the Dataset
|
| 62 |
+
|
| 63 |
+
We have pre-configured datasets for common text lengths (4k, 8k, 32k, 128k, 200k, 1000k) in `configs/datasets/needlebench`, allowing you to flexibly create datasets that meet your needs by defining related parameters in the configuration files.
|
| 64 |
+
|
| 65 |
+
### Evaluation Example
|
| 66 |
+
|
| 67 |
+
#### Evaluating `InternLM2-7B` Model Deployed Using `LMDeploy`
|
| 68 |
+
|
| 69 |
+
For example, to evaluate the `InternLM2-7B` model deployed using `LMDeploy` for all tasks in NeedleBench-4K, you can directly use the following command in the command line. This command calls the pre-defined model and dataset configuration files without needing to write additional configuration files:
|
| 70 |
+
|
| 71 |
+
##### Local Evaluation
|
| 72 |
+
|
| 73 |
+
If you are evaluating the model locally, the command below will utilize all available GPUs on your machine. You can limit the GPU access for `OpenCompass` by setting the `CUDA_VISIBLE_DEVICES` environment variable. For instance, using `CUDA_VISIBLE_DEVICES=0,1,2,3 python run.py ...` will only expose the first four GPUs to OpenCompass, ensuring that it does not use more than these four GPUs.
|
| 74 |
+
|
| 75 |
+
```bash
|
| 76 |
+
# Local Evaluation
|
| 77 |
+
python run.py --dataset needlebench_4k --models lmdeploy_internlm2_chat_7b --summarizer needlebench/needlebench_4k_summarizer
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
##### Evaluation on a Slurm Cluster
|
| 81 |
+
|
| 82 |
+
If using `Slurm`, you can add parameters such as `--slurm -p partition_name -q reserved --max-num-workers 16`, as shown below:
|
| 83 |
+
|
| 84 |
+
```bash
|
| 85 |
+
# Slurm Evaluation
|
| 86 |
+
python run.py --dataset needlebench_4k --models lmdeploy_internlm2_chat_7b --summarizer needlebench/needlebench_4k_summarizer --slurm -p partition_name -q reserved --max-num-workers 16
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
##### Evaluating a Subdataset Only
|
| 90 |
+
|
| 91 |
+
If you only want to test the original NeedleInAHaystack task setup, you could change the dataset parameter to `needlebench_single_4k`, which corresponds to the single needle version of the NeedleInAHaystack test at 4k length:
|
| 92 |
+
|
| 93 |
+
```bash
|
| 94 |
+
python run.py --dataset needlebench_single_4k --models lmdeploy_internlm2_chat_7b --summarizer needlebench/needlebench_4k_summarizer --slurm -p partition_name -q reserved --max-num-workers 16
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
You can also choose to evaluate a specific subdataset, such as changing the `--datasets` parameter to `needlebench_single_4k/needlebench_zh_datasets` for testing just the Chinese version of the single needle 4K length NeedleInAHaystack task. The parameter after `/` represents the subdataset, which can be found in the dataset variable of `configs/datasets/needlebench/needlebench_4k/needlebench_single_4k.py` :
|
| 98 |
+
|
| 99 |
+
```bash
|
| 100 |
+
python run.py --dataset needlebench_single_4k/needlebench_zh_datasets --models lmdeploy_internlm2_chat_7b --summarizer needlebench/needlebench_4k_summarizer --slurm -p partition_name -q reserved --max-num-workers 16
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
Be sure to install the [LMDeploy](https://github.com/InternLM/lmdeploy) tool before starting the evaluation:
|
| 104 |
+
|
| 105 |
+
```bash
|
| 106 |
+
pip install lmdeploy
|
| 107 |
+
```
|
| 108 |
+
|
| 109 |
+
This command initiates the evaluation process, with parameters `-p partition_name -q reserved` and `--max-num-workers 16` used to specify the Slurm partition name and the maximum number of worker processes.
|
| 110 |
+
|
| 111 |
+
#### Evaluating Other `Huggingface` Models
|
| 112 |
+
|
| 113 |
+
For other models, we recommend writing an additional configuration file to modify the model's `max_seq_len` and `max_out_len` parameters so the model can receive the complete long text content, as we have prepared in the `configs/eval_needlebench.py` file. The complete content is as follows:
|
| 114 |
+
|
| 115 |
+
```python
|
| 116 |
+
from mmengine.config import read_base
|
| 117 |
+
# We use mmengine.config to import variables from other configuration files
|
| 118 |
+
|
| 119 |
+
with read_base():
|
| 120 |
+
# from .models.hf_internlm.lmdeploy_internlm2_chat_7b import models as internlm2_chat_7b_200k
|
| 121 |
+
from .models.hf_internlm.hf_internlm2_chat_7b import models as internlm2_chat_7b
|
| 122 |
+
|
| 123 |
+
# Evaluate needlebench_4k, adjust the configuration to use 8k, 32k, 128k, 200k, or 1000k if necessary.
|
| 124 |
+
# from .datasets.needlebench.needlebench_4k.needlebench_4k import needlebench_datasets
|
| 125 |
+
# from .summarizers.needlebench import needlebench_4k_summarizer as summarizer
|
| 126 |
+
|
| 127 |
+
# only eval original "needle in a haystack test" in needlebench_4k
|
| 128 |
+
from .datasets.needlebench.needlebench_4k.needlebench_single_4k import needlebench_zh_datasets, needlebench_en_datasets
|
| 129 |
+
from .summarizers.needlebench import needlebench_4k_summarizer as summarizer
|
| 130 |
+
|
| 131 |
+
# eval Ancestral Tracing Challenge(ATC)
|
| 132 |
+
# from .datasets.needlebench.atc.atc_choice_50 import needlebench_datasets
|
| 133 |
+
# from .summarizers.needlebench import atc_summarizer_50 as summarizer
|
| 134 |
+
|
| 135 |
+
datasets = sum([v for k, v in locals().items() if ('datasets' in k)], [])
|
| 136 |
+
|
| 137 |
+
for m in internlm2_chat_7b:
|
| 138 |
+
m['max_seq_len'] = 30768 # Ensure InternLM2-7B model can receive the complete long text, other models need to adjust according to their maximum sequence length support.
|
| 139 |
+
m['max_out_len'] = 2000 # Ensure that in the multi-needle recall task, the model can receive a complete response
|
| 140 |
+
|
| 141 |
+
models = internlm2_chat_7b
|
| 142 |
+
|
| 143 |
+
work_dir = './outputs/needlebench'
|
| 144 |
+
```
|
| 145 |
+
|
| 146 |
+
Once the test `config` file is written, we can pass the corresponding config file path through the `run.py` file in the command line, such as:
|
| 147 |
+
|
| 148 |
+
```bash
|
| 149 |
+
python run.py configs/eval_needlebench.py --slurm -p partition_name -q reserved --max-num-workers 16
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
Note, at this point, we do not need to pass in the `--dataset, --models, --summarizer` parameters, as we have already defined these configurations in the config file. You can manually adjust the `--max-num-workers` setting to adjust the number of parallel workers.
|
| 153 |
+
|
| 154 |
+
### Visualization
|
| 155 |
+
|
| 156 |
+
We have built-in result visualization into the `summarizer` implementation in the latest code version. You can find the corresponding visualizations in the plots directory of the respective output folder, eliminating the need for manual visualization of scores across various depths and lengths.
|
| 157 |
+
|
| 158 |
+
If you use this method, please add a reference:
|
| 159 |
+
|
| 160 |
+
```bibtex
|
| 161 |
+
|
| 162 |
+
@misc{li2024needlebenchllmsretrievalreasoning,
|
| 163 |
+
title={NeedleBench: Can LLMs Do Retrieval and Reasoning in 1 Million Context Window?},
|
| 164 |
+
author={Mo Li and Songyang Zhang and Yunxin Liu and Kai Chen},
|
| 165 |
+
year={2024},
|
| 166 |
+
eprint={2407.11963},
|
| 167 |
+
archivePrefix={arXiv},
|
| 168 |
+
primaryClass={cs.CL},
|
| 169 |
+
url={https://arxiv.org/abs/2407.11963},
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
@misc{2023opencompass,
|
| 173 |
+
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
|
| 174 |
+
author={OpenCompass Contributors},
|
| 175 |
+
howpublished={\url{https://github.com/open-compass/opencompass}},
|
| 176 |
+
year={2023}
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
@misc{LLMTest_NeedleInAHaystack,
|
| 182 |
+
title={LLMTest Needle In A Haystack - Pressure Testing LLMs},
|
| 183 |
+
author={gkamradt},
|
| 184 |
+
year={2023},
|
| 185 |
+
howpublished={\url{https://github.com/gkamradt/LLMTest_NeedleInAHaystack}}
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
@misc{wei2023skywork,
|
| 189 |
+
title={Skywork: A More Open Bilingual Foundation Model},
|
| 190 |
+
author={Tianwen Wei and Liang Zhao and Lichang Zhang and Bo Zhu and Lijie Wang and Haihua Yang and Biye Li and Cheng Cheng and Weiwei Lü and Rui Hu and Chenxia Li and Liu Yang and Xilin Luo and Xuejie Wu and Lunan Liu and Wenjun Cheng and Peng Cheng and Jianhao Zhang and Xiaoyu Zhang and Lei Lin and Xiaokun Wang and Yutuan Ma and Chuanhai Dong and Yanqi Sun and Yifu Chen and Yongyi Peng and Xiaojuan Liang and Shuicheng Yan and Han Fang and Yahui Zhou},
|
| 191 |
+
year={2023},
|
| 192 |
+
eprint={2310.19341},
|
| 193 |
+
archivePrefix={arXiv},
|
| 194 |
+
primaryClass={cs.CL}
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
```
|
opencompass/docs/en/advanced_guides/new_dataset.md
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Add a dataset
|
| 2 |
+
|
| 3 |
+
Although OpenCompass has already included most commonly used datasets, users need to follow the steps below to support a new dataset if wanted:
|
| 4 |
+
|
| 5 |
+
1. Add a dataset script `mydataset.py` to the `opencompass/datasets` folder. This script should include:
|
| 6 |
+
|
| 7 |
+
- The dataset and its loading method. Define a `MyDataset` class that implements the data loading method `load` as a static method. This method should return data of type `datasets.Dataset`. We use the Hugging Face dataset as the unified interface for datasets to avoid introducing additional logic. Here's an example:
|
| 8 |
+
|
| 9 |
+
```python
|
| 10 |
+
import datasets
|
| 11 |
+
from .base import BaseDataset
|
| 12 |
+
|
| 13 |
+
class MyDataset(BaseDataset):
|
| 14 |
+
|
| 15 |
+
@staticmethod
|
| 16 |
+
def load(**kwargs) -> datasets.Dataset:
|
| 17 |
+
pass
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
- (Optional) If the existing evaluators in OpenCompass do not meet your needs, you need to define a `MyDatasetEvaluator` class that implements the scoring method `score`. This method should take `predictions` and `references` as input and return the desired dictionary. Since a dataset may have multiple metrics, the method should return a dictionary containing the metrics and their corresponding scores. Here's an example:
|
| 21 |
+
|
| 22 |
+
```python
|
| 23 |
+
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
| 24 |
+
|
| 25 |
+
class MyDatasetEvaluator(BaseEvaluator):
|
| 26 |
+
|
| 27 |
+
def score(self, predictions: List, references: List) -> dict:
|
| 28 |
+
pass
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
- (Optional) If the existing postprocessors in OpenCompass do not meet your needs, you need to define the `mydataset_postprocess` method. This method takes an input string and returns the corresponding postprocessed result string. Here's an example:
|
| 32 |
+
|
| 33 |
+
```python
|
| 34 |
+
def mydataset_postprocess(text: str) -> str:
|
| 35 |
+
pass
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
2. After defining the dataset loading, data postprocessing, and evaluator methods, you need to add the following configurations to the configuration file:
|
| 39 |
+
|
| 40 |
+
```python
|
| 41 |
+
from opencompass.datasets import MyDataset, MyDatasetEvaluator, mydataset_postprocess
|
| 42 |
+
|
| 43 |
+
mydataset_eval_cfg = dict(
|
| 44 |
+
evaluator=dict(type=MyDatasetEvaluator),
|
| 45 |
+
pred_postprocessor=dict(type=mydataset_postprocess))
|
| 46 |
+
|
| 47 |
+
mydataset_datasets = [
|
| 48 |
+
dict(
|
| 49 |
+
type=MyDataset,
|
| 50 |
+
...,
|
| 51 |
+
reader_cfg=...,
|
| 52 |
+
infer_cfg=...,
|
| 53 |
+
eval_cfg=mydataset_eval_cfg)
|
| 54 |
+
]
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
Detailed dataset configuration files and other required configuration files can be referred to in the [Configuration Files](../user_guides/config.md) tutorial. For guides on launching tasks, please refer to the [Quick Start](../get_started/quick_start.md) tutorial.
|
opencompass/docs/en/advanced_guides/new_model.md
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Add a Model
|
| 2 |
+
|
| 3 |
+
Currently, we support HF models, some model APIs, and some third-party models.
|
| 4 |
+
|
| 5 |
+
## Adding API Models
|
| 6 |
+
|
| 7 |
+
To add a new API-based model, you need to create a new file named `mymodel_api.py` under `opencompass/models` directory. In this file, you should inherit from `BaseAPIModel` and implement the `generate` method for inference and the `get_token_len` method to calculate the length of tokens. Once you have defined the model, you can modify the corresponding configuration file.
|
| 8 |
+
|
| 9 |
+
```python
|
| 10 |
+
from ..base_api import BaseAPIModel
|
| 11 |
+
|
| 12 |
+
class MyModelAPI(BaseAPIModel):
|
| 13 |
+
|
| 14 |
+
is_api: bool = True
|
| 15 |
+
|
| 16 |
+
def __init__(self,
|
| 17 |
+
path: str,
|
| 18 |
+
max_seq_len: int = 2048,
|
| 19 |
+
query_per_second: int = 1,
|
| 20 |
+
retry: int = 2,
|
| 21 |
+
**kwargs):
|
| 22 |
+
super().__init__(path=path,
|
| 23 |
+
max_seq_len=max_seq_len,
|
| 24 |
+
meta_template=meta_template,
|
| 25 |
+
query_per_second=query_per_second,
|
| 26 |
+
retry=retry)
|
| 27 |
+
...
|
| 28 |
+
|
| 29 |
+
def generate(
|
| 30 |
+
self,
|
| 31 |
+
inputs,
|
| 32 |
+
max_out_len: int = 512,
|
| 33 |
+
temperature: float = 0.7,
|
| 34 |
+
) -> List[str]:
|
| 35 |
+
"""Generate results given a list of inputs."""
|
| 36 |
+
pass
|
| 37 |
+
|
| 38 |
+
def get_token_len(self, prompt: str) -> int:
|
| 39 |
+
"""Get lengths of the tokenized string."""
|
| 40 |
+
pass
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
## Adding Third-Party Models
|
| 44 |
+
|
| 45 |
+
To add a new third-party model, you need to create a new file named `mymodel.py` under `opencompass/models` directory. In this file, you should inherit from `BaseModel` and implement the `generate` method for generative inference, the `get_ppl` method for discriminative inference, and the `get_token_len` method to calculate the length of tokens. Once you have defined the model, you can modify the corresponding configuration file.
|
| 46 |
+
|
| 47 |
+
```python
|
| 48 |
+
from ..base import BaseModel
|
| 49 |
+
|
| 50 |
+
class MyModel(BaseModel):
|
| 51 |
+
|
| 52 |
+
def __init__(self,
|
| 53 |
+
pkg_root: str,
|
| 54 |
+
ckpt_path: str,
|
| 55 |
+
tokenizer_only: bool = False,
|
| 56 |
+
meta_template: Optional[Dict] = None,
|
| 57 |
+
**kwargs):
|
| 58 |
+
...
|
| 59 |
+
|
| 60 |
+
def get_token_len(self, prompt: str) -> int:
|
| 61 |
+
"""Get lengths of the tokenized strings."""
|
| 62 |
+
pass
|
| 63 |
+
|
| 64 |
+
def generate(self, inputs: List[str], max_out_len: int) -> List[str]:
|
| 65 |
+
"""Generate results given a list of inputs. """
|
| 66 |
+
pass
|
| 67 |
+
|
| 68 |
+
def get_ppl(self,
|
| 69 |
+
inputs: List[str],
|
| 70 |
+
mask_length: Optional[List[int]] = None) -> List[float]:
|
| 71 |
+
"""Get perplexity scores given a list of inputs."""
|
| 72 |
+
pass
|
| 73 |
+
```
|
opencompass/docs/en/advanced_guides/objective_judgelm_evaluation.md
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Using Large Models as JudgeLLM for Objective Evaluation
|
| 2 |
+
|
| 3 |
+
## Introduction
|
| 4 |
+
|
| 5 |
+
Traditional objective evaluations often rely on standard answers for reference. However, in practical applications, the predicted results of models may vary due to differences in the model's instruction-following capabilities or imperfections in post-processing functions. This can lead to incorrect extraction of answers and comparison with standard answers, resulting in potentially inaccurate evaluation outcomes. To address this issue, we have adopted a process similar to subjective evaluations by introducing JudgeLLM post-prediction to assess the consistency between model responses and standard answers. ([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685)).
|
| 6 |
+
|
| 7 |
+
Currently, all models supported by the opencompass repository can be directly used as JudgeLLM. Additionally, we are planning to support dedicated JudgeLLMs.
|
| 8 |
+
|
| 9 |
+
## Currently Supported Objective Evaluation Datasets
|
| 10 |
+
|
| 11 |
+
1. MATH ([https://github.com/hendrycks/math](https://github.com/hendrycks/math))
|
| 12 |
+
|
| 13 |
+
## Custom JudgeLLM Objective Dataset Evaluation
|
| 14 |
+
|
| 15 |
+
OpenCompass currently supports most datasets that use `GenInferencer` for inference. The specific process for custom JudgeLLM objective evaluation includes:
|
| 16 |
+
|
| 17 |
+
1. Building evaluation configurations using API models or open-source models for inference of question answers.
|
| 18 |
+
2. Employing a selected evaluation model (JudgeLLM) to assess the outputs of the model.
|
| 19 |
+
|
| 20 |
+
### Step One: Building Evaluation Configurations, Using MATH as an Example
|
| 21 |
+
|
| 22 |
+
Below is the Config for evaluating the MATH dataset with JudgeLLM, with the evaluation model being *Llama3-8b-instruct* and the JudgeLLM being *Llama3-70b-instruct*. For more detailed config settings, please refer to `configs/eval_math_llm_judge.py`. The following is a brief version of the annotations to help users understand the meaning of the configuration file.
|
| 23 |
+
|
| 24 |
+
```python
|
| 25 |
+
# Most of the code in this file is copied from https://github.com/openai/simple-evals/blob/main/math_eval.py
|
| 26 |
+
from mmengine.config import read_base
|
| 27 |
+
with read_base():
|
| 28 |
+
from .models.hf_llama.hf_llama3_8b_instruct import models as hf_llama3_8b_instruct_model # noqa: F401, F403
|
| 29 |
+
from .models.hf_llama.hf_llama3_70b_instruct import models as hf_llama3_70b_instruct_model # noqa: F401, F403
|
| 30 |
+
from .datasets.math.math_llm_judge import math_datasets # noqa: F401, F403
|
| 31 |
+
from opencompass.datasets import math_judement_preprocess
|
| 32 |
+
from opencompass.partitioners import NaivePartitioner, SizePartitioner
|
| 33 |
+
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
| 34 |
+
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
| 35 |
+
from opencompass.runners import LocalRunner
|
| 36 |
+
from opencompass.runners import SlurmSequentialRunner
|
| 37 |
+
from opencompass.tasks import OpenICLInferTask
|
| 38 |
+
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
| 39 |
+
from opencompass.summarizers import AllObjSummarizer
|
| 40 |
+
from opencompass.openicl.icl_evaluator import LMEvaluator
|
| 41 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# ------------- Prompt Settings ----------------------------------------
|
| 45 |
+
# Evaluation template, please modify the template as needed, JudgeLLM typically uses [Yes] or [No] as the response. For the MATH dataset, the evaluation template is as follows:
|
| 46 |
+
eng_obj_prompt = """
|
| 47 |
+
Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications
|
| 48 |
+
|
| 49 |
+
Examples:
|
| 50 |
+
|
| 51 |
+
Expression 1: $2x+3$
|
| 52 |
+
Expression 2: $3+2x$
|
| 53 |
+
|
| 54 |
+
[Yes]
|
| 55 |
+
|
| 56 |
+
Expression 1: 3/2
|
| 57 |
+
Expression 2: 1.5
|
| 58 |
+
|
| 59 |
+
[Yes]
|
| 60 |
+
|
| 61 |
+
Expression 1: $x^2+2x+1$
|
| 62 |
+
Expression 2: $y^2+2y+1$
|
| 63 |
+
|
| 64 |
+
[No]
|
| 65 |
+
|
| 66 |
+
Expression 1: $x^2+2x+1$
|
| 67 |
+
Expression 2: $(x+1)^2$
|
| 68 |
+
|
| 69 |
+
[Yes]
|
| 70 |
+
|
| 71 |
+
Expression 1: 3245/5
|
| 72 |
+
Expression 2: 649
|
| 73 |
+
|
| 74 |
+
[No]
|
| 75 |
+
(these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications)
|
| 76 |
+
|
| 77 |
+
Expression 1: 2/(-3)
|
| 78 |
+
Expression 2: -2/3
|
| 79 |
+
|
| 80 |
+
[Yes]
|
| 81 |
+
(trivial simplifications are allowed)
|
| 82 |
+
|
| 83 |
+
Expression 1: 72 degrees
|
| 84 |
+
Expression 2: 72
|
| 85 |
+
|
| 86 |
+
[Yes]
|
| 87 |
+
(give benefit of the doubt to units)
|
| 88 |
+
|
| 89 |
+
Expression 1: 64
|
| 90 |
+
Expression 2: 64 square feet
|
| 91 |
+
|
| 92 |
+
[Yes]
|
| 93 |
+
(give benefit of the doubt to units)
|
| 94 |
+
|
| 95 |
+
Expression 1: 64
|
| 96 |
+
Expression 2:
|
| 97 |
+
|
| 98 |
+
[No]
|
| 99 |
+
(only mark as equivalent if both expressions are nonempty)
|
| 100 |
+
|
| 101 |
+
---
|
| 102 |
+
|
| 103 |
+
YOUR TASK
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
Respond with only "[Yes]" or "[No]" (without quotes). Do not include a rationale.
|
| 107 |
+
Expression 1: {obj_gold}
|
| 108 |
+
Expression 2: {prediction}
|
| 109 |
+
|
| 110 |
+
"""
|
| 111 |
+
|
| 112 |
+
# ------------- Inference Phase ----------------------------------------
|
| 113 |
+
# Models to be evaluated
|
| 114 |
+
models = [*hf_llama3_8b_instruct_model]
|
| 115 |
+
# Evaluation models
|
| 116 |
+
judge_models = hf_llama3_70b_instruct_model
|
| 117 |
+
|
| 118 |
+
eng_datasets = [*math_datasets]
|
| 119 |
+
chn_datasets = []
|
| 120 |
+
datasets = eng_datasets + chn_datasets
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
for d in eng_datasets:
|
| 124 |
+
d['eval_cfg']= dict(
|
| 125 |
+
evaluator=dict(
|
| 126 |
+
type=LMEvaluator,
|
| 127 |
+
# If you need to preprocess model predictions before judging,
|
| 128 |
+
# you can specify a pred_postprocessor function here
|
| 129 |
+
pred_postprocessor=dict(type=math_judement_preprocess),
|
| 130 |
+
prompt_template=dict(
|
| 131 |
+
type=PromptTemplate,
|
| 132 |
+
template=dict(round=[
|
| 133 |
+
dict(
|
| 134 |
+
role='HUMAN',
|
| 135 |
+
prompt = eng_obj_prompt
|
| 136 |
+
),
|
| 137 |
+
]),
|
| 138 |
+
),
|
| 139 |
+
),
|
| 140 |
+
pred_role="BOT",
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
infer = dict(
|
| 144 |
+
partitioner=dict(type=SizePartitioner, max_task_size=40000),
|
| 145 |
+
runner=dict(
|
| 146 |
+
type=LocalRunner,
|
| 147 |
+
max_num_workers=256,
|
| 148 |
+
task=dict(type=OpenICLInferTask)),
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
# ------------- Evaluation Configuration --------------------------------
|
| 152 |
+
eval = dict(
|
| 153 |
+
partitioner=dict(
|
| 154 |
+
type=SubjectiveSizePartitioner, max_task_size=80000, mode='singlescore', models=models, judge_models=judge_models,
|
| 155 |
+
),
|
| 156 |
+
runner=dict(type=LocalRunner,
|
| 157 |
+
max_num_workers=16, task=dict(type=SubjectiveEvalTask)),
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
summarizer = dict(
|
| 161 |
+
type=AllObjSummarizer
|
| 162 |
+
)
|
| 163 |
+
|
| 164 |
+
# Output folder
|
| 165 |
+
work_dir = 'outputs/obj_all/'
|
| 166 |
+
```
|
| 167 |
+
|
| 168 |
+
### Step Two: Launch Evaluation and Output Results
|
| 169 |
+
|
| 170 |
+
```shell
|
| 171 |
+
python run.py eval_math_llm_judge.py
|
| 172 |
+
```
|
| 173 |
+
|
| 174 |
+
This will initiate two rounds of evaluation. The first round involves model inference to obtain predicted answers to questions, and the second round involves JudgeLLM evaluating the consistency between the predicted answers and the standard answers, and scoring them.
|
| 175 |
+
|
| 176 |
+
- The results of model predictions will be saved in `output/.../timestamp/predictions/xxmodel/xxx.json`
|
| 177 |
+
- The JudgeLLM's evaluation responses will be saved in `output/.../timestamp/results/xxmodel/xxx.json`
|
| 178 |
+
- The evaluation report will be output to `output/.../timestamp/summary/timestamp/xxx.csv`
|
| 179 |
+
|
| 180 |
+
## Results
|
| 181 |
+
|
| 182 |
+
Using the Llama3-8b-instruct as the evaluation model and the Llama3-70b-instruct as the evaluator, the MATH dataset was assessed with the following results:
|
| 183 |
+
|
| 184 |
+
| Model | JudgeLLM Evaluation | Naive Evaluation |
|
| 185 |
+
| ------------------- | ------------------- | ---------------- |
|
| 186 |
+
| llama-3-8b-instruct | 27.7 | 27.8 |
|
opencompass/docs/en/advanced_guides/prompt_attack.md
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Prompt Attack
|
| 2 |
+
|
| 3 |
+
We support prompt attack following the idea of [PromptBench](https://github.com/microsoft/promptbench). The main purpose here is to evaluate the robustness of the prompt instruction: when we attack/modify the prompt that instructs the task, how well can the task still perform compared with the original prompt.
|
| 4 |
+
|
| 5 |
+
## Set up environment
|
| 6 |
+
|
| 7 |
+
Some components are necessary for the prompt attack experiment, therefore we need to set up the environment.
|
| 8 |
+
|
| 9 |
+
```shell
|
| 10 |
+
git clone https://github.com/microsoft/promptbench.git
|
| 11 |
+
pip install textattack==0.3.8
|
| 12 |
+
export PYTHONPATH=$PYTHONPATH:promptbench/
|
| 13 |
+
```
|
| 14 |
+
|
| 15 |
+
## How to attack
|
| 16 |
+
|
| 17 |
+
### Add a dataset config
|
| 18 |
+
|
| 19 |
+
We will use GLUE-wnli dataset as example, most configuration settings can refer to [config.md](../user_guides/config.md) for help.
|
| 20 |
+
|
| 21 |
+
First we need to support the basic dataset config; you can find the existing config files in `configs`, or add your own config according to [new-dataset](./new_dataset.md)
|
| 22 |
+
|
| 23 |
+
Take the following `infer_cfg` as example, we need to define the prompt template. `adv_prompt` is the basic prompt placeholder to be attacked in the experiment. `sentence1` and `sentence2` are the input columns of this dataset. The attack will only modify the `adv_prompt` here.
|
| 24 |
+
|
| 25 |
+
Then, we should use `AttackInferencer` with `original_prompt_list` and `adv_key` to tell the inferencer where to attack and what text to be attacked.
|
| 26 |
+
|
| 27 |
+
More details can refer to `configs/datasets/promptbench/promptbench_wnli_gen_50662f.py` config file.
|
| 28 |
+
|
| 29 |
+
```python
|
| 30 |
+
original_prompt_list = [
|
| 31 |
+
'Are the following two sentences entailment or not_entailment? Answer me with "A. entailment" or "B. not_entailment", just one word. ',
|
| 32 |
+
"Does the relationship between the given sentences represent entailment or not_entailment? Respond with 'A. entailment' or 'B. not_entailment'.",
|
| 33 |
+
...,
|
| 34 |
+
]
|
| 35 |
+
|
| 36 |
+
wnli_infer_cfg = dict(
|
| 37 |
+
prompt_template=dict(
|
| 38 |
+
type=PromptTemplate,
|
| 39 |
+
template=dict(round=[
|
| 40 |
+
dict(
|
| 41 |
+
role="HUMAN",
|
| 42 |
+
prompt="""{adv_prompt}
|
| 43 |
+
Sentence 1: {sentence1}
|
| 44 |
+
Sentence 2: {sentence2}
|
| 45 |
+
Answer:"""),
|
| 46 |
+
]),
|
| 47 |
+
),
|
| 48 |
+
retriever=dict(type=ZeroRetriever),
|
| 49 |
+
inferencer=dict(
|
| 50 |
+
type=AttackInferencer,
|
| 51 |
+
original_prompt_list=original_prompt_list,
|
| 52 |
+
adv_key='adv_prompt'))
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
### Add an eval config
|
| 56 |
+
|
| 57 |
+
We should use `OpenICLAttackTask` here for the attack task. Also, `NaivePartitioner` should be used because the attack experiment will run the whole dataset repeatedly, nearly hundreds of times, to search for the best attack, so we do not want to split the dataset.
|
| 58 |
+
|
| 59 |
+
```note
|
| 60 |
+
Please choose a small dataset (fewer than 1000 examples) for the attack, due to the aforementioned repeated search; otherwise the time cost is enormous.
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
There are several other options in `attack` config:
|
| 64 |
+
|
| 65 |
+
- `attack`: attack type, available options includes `textfooler`, `textbugger`, `deepwordbug`, `bertattack`, `checklist`, `stresstest`;
|
| 66 |
+
- `query_budget`: upper boundary of queries, which means the total numbers of running the dataset;
|
| 67 |
+
- `prompt_topk`: number of top-k prompts to be attacked. In most cases, the original prompt list is greater than 10, and running the whole set is time-consuming.
|
| 68 |
+
|
| 69 |
+
```python
|
| 70 |
+
# Please run whole dataset at a time, aka use `NaivePartitioner` only
|
| 71 |
+
# Please use `OpenICLAttackTask` if want to perform attack experiment
|
| 72 |
+
infer = dict(
|
| 73 |
+
partitioner=dict(type=NaivePartitioner),
|
| 74 |
+
runner=dict(
|
| 75 |
+
type=SlurmRunner,
|
| 76 |
+
max_num_workers=8,
|
| 77 |
+
task=dict(type=OpenICLAttackTask),
|
| 78 |
+
retry=0),
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
attack = dict(
|
| 82 |
+
attack='textfooler',
|
| 83 |
+
query_budget=100,
|
| 84 |
+
prompt_topk=2,
|
| 85 |
+
)
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
### Run the experiment
|
| 89 |
+
|
| 90 |
+
Please use `--mode infer` when running the attack experiment, and set the `PYTHONPATH` environment variable.
|
| 91 |
+
|
| 92 |
+
```shell
|
| 93 |
+
python run.py configs/eval_attack.py --mode infer
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
All the results will be saved in `attack` folder.
|
| 97 |
+
The content includes the original prompt accuracy and, for each of the top-k prompts, the attacked prompt together with its dropped accuracy, for instance:
|
| 98 |
+
|
| 99 |
+
```
|
| 100 |
+
Prompt: Assess the connection between the following sentences and classify it as 'A. entailment' or 'B. not_entailment'., acc: 59.15%
|
| 101 |
+
Prompt: Does the relationship between the given sentences represent entailment or not_entailment? Respond with 'A. entailment' or 'B. not_entailment'., acc: 57.75%
|
| 102 |
+
Prompt: Analyze the two provided sentences and decide if their relationship is 'A. entailment' or 'B. not_entailment'., acc: 56.34%
|
| 103 |
+
Prompt: Identify whether the given pair of sentences demonstrates entailment or not_entailment. Answer with 'A. entailment' or 'B. not_entailment'., acc: 54.93%
|
| 104 |
+
...
|
| 105 |
+
Original prompt: Assess the connection between the following sentences and classify it as 'A. entailment' or 'B. not_entailment'.
|
| 106 |
+
Attacked prompt: b"Assess the attach between the following sentences and sorted it as 'A. entailment' or 'B. not_entailment'."
|
| 107 |
+
Original acc: 59.15%, attacked acc: 40.85%, dropped acc: 18.31%
|
| 108 |
+
```
|
opencompass/docs/en/advanced_guides/subjective_evaluation.md
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Subjective Evaluation Guidance
|
| 2 |
+
|
| 3 |
+
## Introduction
|
| 4 |
+
|
| 5 |
+
Subjective evaluation aims to assess the model's performance in tasks that align with human preferences. The key criterion for this evaluation is human preference, but it comes with a high cost of annotation.
|
| 6 |
+
|
| 7 |
+
To explore the model's subjective capabilities, we employ JudgeLLM as a substitute for human assessors ([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685)).
|
| 8 |
+
|
| 9 |
+
A popular evaluation method involves
|
| 10 |
+
|
| 11 |
+
- Compare Mode: comparing model responses pairwise to calculate their win rate
|
| 12 |
+
- Score Mode: another method involves calculate scores with single model response ([Chatbot Arena](https://chat.lmsys.org/)).
|
| 13 |
+
|
| 14 |
+
We support the use of GPT-4 (or other JudgeLLM) for the subjective evaluation of models based on above methods.
|
| 15 |
+
|
| 16 |
+
## Currently Supported Subjective Evaluation Datasets
|
| 17 |
+
|
| 18 |
+
1. AlignBench Chinese Scoring Dataset (https://github.com/THUDM/AlignBench)
|
| 19 |
+
2. MTBench English Scoring Dataset, two-turn dialogue (https://github.com/lm-sys/FastChat)
|
| 20 |
+
3. MTBench101 English Scoring Dataset, multi-turn dialogue (https://github.com/mtbench101/mt-bench-101)
|
| 21 |
+
4. AlpacaEvalv2 English Compare Dataset (https://github.com/tatsu-lab/alpaca_eval)
|
| 22 |
+
5. ArenaHard English Compare Dataset, mainly focused on coding (https://github.com/lm-sys/arena-hard/tree/main)
|
| 23 |
+
6. Fofo English Scoring Dataset (https://github.com/SalesforceAIResearch/FoFo/)
|
| 24 |
+
7. Wildbench English Score and Compare Dataset(https://github.com/allenai/WildBench)
|
| 25 |
+
|
| 26 |
+
## Initiating Subjective Evaluation
|
| 27 |
+
|
| 28 |
+
Similar to existing objective evaluation methods, you can configure related settings in `configs/eval_subjective.py`.
|
| 29 |
+
|
| 30 |
+
### Basic Parameters: Specifying models, datasets, and judgemodels
|
| 31 |
+
|
| 32 |
+
Similar to objective evaluation, import the models and datasets that need to be evaluated, for example:
|
| 33 |
+
|
| 34 |
+
```
|
| 35 |
+
with read_base():
|
| 36 |
+
from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
|
| 37 |
+
from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
|
| 38 |
+
from .models.qwen.hf_qwen_7b import models
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
It is worth noting that since the model setup parameters for subjective evaluation are often different from those for objective evaluation, it often requires setting up `do_sample` for inference instead of `greedy`. You can modify the relevant parameters in the configuration file as needed, for example:
|
| 42 |
+
|
| 43 |
+
```
|
| 44 |
+
models = [
|
| 45 |
+
dict(
|
| 46 |
+
type=HuggingFaceChatGLM3,
|
| 47 |
+
abbr='chatglm3-6b-hf2',
|
| 48 |
+
path='THUDM/chatglm3-6b',
|
| 49 |
+
tokenizer_path='THUDM/chatglm3-6b',
|
| 50 |
+
model_kwargs=dict(
|
| 51 |
+
device_map='auto',
|
| 52 |
+
trust_remote_code=True,
|
| 53 |
+
),
|
| 54 |
+
tokenizer_kwargs=dict(
|
| 55 |
+
padding_side='left',
|
| 56 |
+
truncation_side='left',
|
| 57 |
+
trust_remote_code=True,
|
| 58 |
+
),
|
| 59 |
+
generation_kwargs=dict(
|
| 60 |
+
do_sample=True,
|
| 61 |
+
),
|
| 62 |
+
meta_template=api_meta_template,
|
| 63 |
+
max_out_len=2048,
|
| 64 |
+
max_seq_len=4096,
|
| 65 |
+
batch_size=8,
|
| 66 |
+
run_cfg=dict(num_gpus=1, num_procs=1),
|
| 67 |
+
)
|
| 68 |
+
]
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
The judgemodel is usually set to a powerful model like GPT4, and you can directly enter your API key according to the configuration in the config file, or use a custom model as the judgemodel.
|
| 72 |
+
|
| 73 |
+
### Specifying Other Parameters
|
| 74 |
+
|
| 75 |
+
In addition to the basic parameters, you can also modify the `infer` and `eval` fields in the config to set a more appropriate partitioning method. The currently supported partitioning methods mainly include three types: NaivePartitioner, SizePartitioner, and NumberWorkPartitioner. You can also specify your own workdir to save related files.
|
| 76 |
+
|
| 77 |
+
## Subjective Evaluation with Custom Dataset
|
| 78 |
+
|
| 79 |
+
The specific process includes:
|
| 80 |
+
|
| 81 |
+
1. Data preparation
|
| 82 |
+
2. Model response generation
|
| 83 |
+
3. Evaluate the response with a JudgeLLM
|
| 84 |
+
4. Generate JudgeLLM's response and calculate the metric
|
| 85 |
+
|
| 86 |
+
### Step-1: Data Preparation
|
| 87 |
+
|
| 88 |
+
This step requires preparing the dataset file and implementing your own dataset class under `Opencompass/datasets/subjective/`, returning the read data in the format of `list of dict`.
|
| 89 |
+
|
| 90 |
+
Actually, you can prepare the data in any format you like (csv, json, jsonl, etc.). However, to make it easier to get started, it is recommended to construct the data according to the format of the existing subjective datasets or according to the following json format.
|
| 91 |
+
We provide mini test-set for **Compare Mode** and **Score Mode** as below:
|
| 92 |
+
|
| 93 |
+
```python
|
| 94 |
+
###COREV2
|
| 95 |
+
[
|
| 96 |
+
{
|
| 97 |
+
"question": "如果我在空中垂直抛球,球最初向哪个方向行进?",
|
| 98 |
+
"capability": "知识-社会常识",
|
| 99 |
+
"others": {
|
| 100 |
+
"question": "如果我在空中垂直抛球,球最初向哪个方向行进?",
|
| 101 |
+
"evaluating_guidance": "",
|
| 102 |
+
"reference_answer": "上"
|
| 103 |
+
}
|
| 104 |
+
},...]
|
| 105 |
+
|
| 106 |
+
###CreationV0.1
|
| 107 |
+
[
|
| 108 |
+
{
|
| 109 |
+
"question": "请你扮演一个邮件管家,我让你给谁发送什么主题的邮件,你就帮我扩充好邮件正文,并打印在聊天框里。你需要根据我提供的邮件收件人以及邮件主题,来斟酌用词,并使用合适的敬语。现在请给导师发送邮件,询问他是否可以下周三下午15:00进行科研同步会,大约200字。",
|
| 110 |
+
"capability": "邮件通知",
|
| 111 |
+
"others": ""
|
| 112 |
+
},
|
| 113 |
+
```
|
| 114 |
+
|
| 115 |
+
The json must include the following fields:
|
| 116 |
+
|
| 117 |
+
- 'question': Question description
|
| 118 |
+
- 'capability': The capability dimension of the question.
|
| 119 |
+
- 'others': Other needed information.
|
| 120 |
+
|
| 121 |
+
If you want to modify the prompt for each single question, you can fill the extra information into 'others' and use it when constructing the prompt.
|
| 122 |
+
|
| 123 |
+
### Step-2: Evaluation Configuration(Compare Mode)
|
| 124 |
+
|
| 125 |
+
Taking Alignbench as an example, `configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py`:
|
| 126 |
+
|
| 127 |
+
1. First, you need to set `subjective_reader_cfg` to receive the relevant fields returned from the custom Dataset class and specify the output fields when saving files.
|
| 128 |
+
2. Then, you need to specify the root path `data_path` of the dataset and the dataset filename `subjective_all_sets`. If there are multiple sub-files, you can add them to this list.
|
| 129 |
+
3. Specify `subjective_infer_cfg` and `subjective_eval_cfg` to configure the corresponding inference and evaluation prompts.
|
| 130 |
+
4. Specify additional information such as `mode` at the corresponding location. Note that the fields required for different subjective datasets may vary.
|
| 131 |
+
5. Define post-processing and score statistics. For example, the postprocessing function `alignbench_postprocess` located under `opencompass/opencompass/datasets/subjective/alignbench`.
|
| 132 |
+
|
| 133 |
+
### Step-3: Launch the Evaluation
|
| 134 |
+
|
| 135 |
+
```shell
|
| 136 |
+
python run.py config/eval_subjective_score.py -r
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
The `-r` parameter allows the reuse of model inference and GPT-4 evaluation results.
|
| 140 |
+
|
| 141 |
+
The response of JudgeLLM will be output to `output/.../results/timestamp/xxmodel/xxdataset/.json`.
|
| 142 |
+
The evaluation report will be output to `output/.../summary/timestamp/report.csv`.
|
| 143 |
+
|
| 144 |
+
## Multi-round Subjective Evaluation in OpenCompass
|
| 145 |
+
|
| 146 |
+
In OpenCompass, we also support subjective multi-turn dialogue evaluation. For instance, the evaluation of MT-Bench can be referred to in `configs/datasets/subjective/multiround`.
|
| 147 |
+
|
| 148 |
+
In the multi-turn dialogue evaluation, you need to organize the data format into the following dialogue structure:
|
| 149 |
+
|
| 150 |
+
```
|
| 151 |
+
"dialogue": [
|
| 152 |
+
{
|
| 153 |
+
"role": "user",
|
| 154 |
+
"content": "Imagine you are participating in a race with a group of people. If you have just overtaken the second person, what's your current position? Where is the person you just overtook?"
|
| 155 |
+
},
|
| 156 |
+
{
|
| 157 |
+
"role": "assistant",
|
| 158 |
+
"content": ""
|
| 159 |
+
},
|
| 160 |
+
{
|
| 161 |
+
"role": "user",
|
| 162 |
+
"content": "If the \"second person\" is changed to \"last person\" in the above question, what would the answer be?"
|
| 163 |
+
},
|
| 164 |
+
{
|
| 165 |
+
"role": "assistant",
|
| 166 |
+
"content": ""
|
| 167 |
+
}
|
| 168 |
+
],
|
| 169 |
+
```
|
| 170 |
+
|
| 171 |
+
It's important to note that due to the different question types in MTBench having different temperature settings, we need to divide the original data files into three different subsets according to the temperature for separate inference. For different subsets, we can set different temperatures. For specific settings, please refer to `configs/datasets/subjective/multiround/mtbench_single_judge_diff_temp.py`.
|
opencompass/docs/en/conf.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# flake8: noqa
|
| 2 |
+
# Configuration file for the Sphinx documentation builder.
|
| 3 |
+
#
|
| 4 |
+
# This file only contains a selection of the most common options. For a full
|
| 5 |
+
# list see the documentation:
|
| 6 |
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
| 7 |
+
|
| 8 |
+
# -- Path setup --------------------------------------------------------------
|
| 9 |
+
|
| 10 |
+
# If extensions (or modules to document with autodoc) are in another directory,
|
| 11 |
+
# add these directories to sys.path here. If the directory is relative to the
|
| 12 |
+
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
| 13 |
+
#
|
| 14 |
+
import os
|
| 15 |
+
import subprocess
|
| 16 |
+
import sys
|
| 17 |
+
|
| 18 |
+
import pytorch_sphinx_theme
|
| 19 |
+
from sphinx.builders.html import StandaloneHTMLBuilder
|
| 20 |
+
|
| 21 |
+
sys.path.insert(0, os.path.abspath('../../'))
|
| 22 |
+
|
| 23 |
+
# -- Project information -----------------------------------------------------
|
| 24 |
+
|
| 25 |
+
project = 'OpenCompass'
|
| 26 |
+
copyright = '2023, OpenCompass'
|
| 27 |
+
author = 'OpenCompass Authors'
|
| 28 |
+
|
| 29 |
+
# The full version, including alpha/beta/rc tags
|
| 30 |
+
version_file = '../../opencompass/__init__.py'
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def get_version():
|
| 34 |
+
with open(version_file, 'r') as f:
|
| 35 |
+
exec(compile(f.read(), version_file, 'exec'))
|
| 36 |
+
return locals()['__version__']
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
release = get_version()
|
| 40 |
+
|
| 41 |
+
# -- General configuration ---------------------------------------------------
|
| 42 |
+
|
| 43 |
+
# Add any Sphinx extension module names here, as strings. They can be
|
| 44 |
+
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
| 45 |
+
# ones.
|
| 46 |
+
extensions = [
|
| 47 |
+
'sphinx.ext.autodoc',
|
| 48 |
+
'sphinx.ext.autosummary',
|
| 49 |
+
'sphinx.ext.intersphinx',
|
| 50 |
+
'sphinx.ext.napoleon',
|
| 51 |
+
'sphinx.ext.viewcode',
|
| 52 |
+
'myst_parser',
|
| 53 |
+
'sphinx_copybutton',
|
| 54 |
+
'sphinx_tabs.tabs',
|
| 55 |
+
'notfound.extension',
|
| 56 |
+
'sphinxcontrib.jquery',
|
| 57 |
+
'sphinx_design',
|
| 58 |
+
]
|
| 59 |
+
|
| 60 |
+
# Add any paths that contain templates here, relative to this directory.
|
| 61 |
+
templates_path = ['_templates']
|
| 62 |
+
|
| 63 |
+
# The suffix(es) of source filenames.
|
| 64 |
+
# You can specify multiple suffix as a list of string:
|
| 65 |
+
#
|
| 66 |
+
source_suffix = {
|
| 67 |
+
'.rst': 'restructuredtext',
|
| 68 |
+
'.md': 'markdown',
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
language = 'en'
|
| 72 |
+
|
| 73 |
+
# The master toctree document.
|
| 74 |
+
root_doc = 'index'
|
| 75 |
+
|
| 76 |
+
# List of patterns, relative to source directory, that match files and
|
| 77 |
+
# directories to ignore when looking for source files.
|
| 78 |
+
# This pattern also affects html_static_path and html_extra_path.
|
| 79 |
+
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
|
| 80 |
+
|
| 81 |
+
# -- Options for HTML output -------------------------------------------------
|
| 82 |
+
|
| 83 |
+
# The theme to use for HTML and HTML Help pages. See the documentation for
|
| 84 |
+
# a list of builtin themes.
|
| 85 |
+
#
|
| 86 |
+
html_theme = 'pytorch_sphinx_theme'
|
| 87 |
+
html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]
|
| 88 |
+
|
| 89 |
+
# Theme options are theme-specific and customize the look and feel of a theme
|
| 90 |
+
# further. For a list of options available for each theme, see the
|
| 91 |
+
# documentation.
|
| 92 |
+
# yapf: disable
|
| 93 |
+
html_theme_options = {
|
| 94 |
+
'menu': [
|
| 95 |
+
{
|
| 96 |
+
'name': 'GitHub',
|
| 97 |
+
'url': 'https://github.com/open-compass/opencompass'
|
| 98 |
+
},
|
| 99 |
+
],
|
| 100 |
+
# Specify the language of shared menu
|
| 101 |
+
'menu_lang': 'en',
|
| 102 |
+
# Disable the default edit on GitHub
|
| 103 |
+
'default_edit_on_github': False,
|
| 104 |
+
}
|
| 105 |
+
# yapf: enable
|
| 106 |
+
|
| 107 |
+
# Add any paths that contain custom static files (such as style sheets) here,
|
| 108 |
+
# relative to this directory. They are copied after the builtin static files,
|
| 109 |
+
# so a file named "default.css" will overwrite the builtin "default.css".
|
| 110 |
+
html_static_path = ['_static']
|
| 111 |
+
html_css_files = [
|
| 112 |
+
'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.css',
|
| 113 |
+
'css/readthedocs.css'
|
| 114 |
+
]
|
| 115 |
+
html_js_files = [
|
| 116 |
+
'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.js',
|
| 117 |
+
'js/custom.js'
|
| 118 |
+
]
|
| 119 |
+
|
| 120 |
+
# -- Options for HTMLHelp output ---------------------------------------------
|
| 121 |
+
|
| 122 |
+
# Output file base name for HTML help builder.
|
| 123 |
+
htmlhelp_basename = 'opencompassdoc'
|
| 124 |
+
|
| 125 |
+
# -- Options for LaTeX output ------------------------------------------------
|
| 126 |
+
|
| 127 |
+
latex_elements = {
|
| 128 |
+
# The paper size ('letterpaper' or 'a4paper').
|
| 129 |
+
#
|
| 130 |
+
# 'papersize': 'letterpaper',
|
| 131 |
+
|
| 132 |
+
# The font size ('10pt', '11pt' or '12pt').
|
| 133 |
+
#
|
| 134 |
+
# 'pointsize': '10pt',
|
| 135 |
+
|
| 136 |
+
# Additional stuff for the LaTeX preamble.
|
| 137 |
+
#
|
| 138 |
+
# 'preamble': '',
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
# Grouping the document tree into LaTeX files. List of tuples
|
| 142 |
+
# (source start file, target name, title,
|
| 143 |
+
# author, documentclass [howto, manual, or own class]).
|
| 144 |
+
latex_documents = [
|
| 145 |
+
(root_doc, 'opencompass.tex', 'OpenCompass Documentation', author,
|
| 146 |
+
'manual'),
|
| 147 |
+
]
|
| 148 |
+
|
| 149 |
+
# -- Options for manual page output ------------------------------------------
|
| 150 |
+
|
| 151 |
+
# One entry per manual page. List of tuples
|
| 152 |
+
# (source start file, name, description, authors, manual section).
|
| 153 |
+
man_pages = [(root_doc, 'opencompass', 'OpenCompass Documentation', [author],
|
| 154 |
+
1)]
|
| 155 |
+
|
| 156 |
+
# -- Options for Texinfo output ----------------------------------------------
|
| 157 |
+
|
| 158 |
+
# Grouping the document tree into Texinfo files. List of tuples
|
| 159 |
+
# (source start file, target name, title, author,
|
| 160 |
+
# dir menu entry, description, category)
|
| 161 |
+
texinfo_documents = [
|
| 162 |
+
(root_doc, 'opencompass', 'OpenCompass Documentation', author,
|
| 163 |
+
'OpenCompass Authors', 'AGI evaluation toolbox and benchmark.',
|
| 164 |
+
'Miscellaneous'),
|
| 165 |
+
]
|
| 166 |
+
|
| 167 |
+
# -- Options for Epub output -------------------------------------------------
|
| 168 |
+
|
| 169 |
+
# Bibliographic Dublin Core info.
|
| 170 |
+
epub_title = project
|
| 171 |
+
|
| 172 |
+
# The unique identifier of the text. This can be a ISBN number
|
| 173 |
+
# or the project homepage.
|
| 174 |
+
#
|
| 175 |
+
# epub_identifier = ''
|
| 176 |
+
|
| 177 |
+
# A unique identification for the text.
|
| 178 |
+
#
|
| 179 |
+
# epub_uid = ''
|
| 180 |
+
|
| 181 |
+
# A list of files that should not be packed into the epub file.
|
| 182 |
+
epub_exclude_files = ['search.html']
|
| 183 |
+
|
| 184 |
+
# set priority when building html
|
| 185 |
+
StandaloneHTMLBuilder.supported_image_types = [
|
| 186 |
+
'image/svg+xml', 'image/gif', 'image/png', 'image/jpeg'
|
| 187 |
+
]
|
| 188 |
+
|
| 189 |
+
# -- Extension configuration -------------------------------------------------
|
| 190 |
+
# Ignore >>> when copying code
|
| 191 |
+
copybutton_prompt_text = r'>>> |\.\.\. '
|
| 192 |
+
copybutton_prompt_is_regexp = True
|
| 193 |
+
|
| 194 |
+
# Auto-generated header anchors
|
| 195 |
+
myst_heading_anchors = 3
|
| 196 |
+
# Enable "colon_fence" extension of myst.
|
| 197 |
+
myst_enable_extensions = ['colon_fence', 'dollarmath']
|
| 198 |
+
|
| 199 |
+
# Configuration for intersphinx
|
| 200 |
+
intersphinx_mapping = {
|
| 201 |
+
'python': ('https://docs.python.org/3', None),
|
| 202 |
+
'numpy': ('https://numpy.org/doc/stable', None),
|
| 203 |
+
'torch': ('https://pytorch.org/docs/stable/', None),
|
| 204 |
+
'mmengine': ('https://mmengine.readthedocs.io/en/latest/', None),
|
| 205 |
+
'transformers':
|
| 206 |
+
('https://huggingface.co/docs/transformers/main/en/', None),
|
| 207 |
+
}
|
| 208 |
+
napoleon_custom_sections = [
|
| 209 |
+
# Custom sections for data elements.
|
| 210 |
+
('Meta fields', 'params_style'),
|
| 211 |
+
('Data fields', 'params_style'),
|
| 212 |
+
]
|
| 213 |
+
|
| 214 |
+
# Disable docstring inheritance
|
| 215 |
+
autodoc_inherit_docstrings = False
|
| 216 |
+
# Mock some imports during generate API docs.
|
| 217 |
+
autodoc_mock_imports = ['rich', 'attr', 'einops']
|
| 218 |
+
# Disable displaying type annotations, these can be very verbose
|
| 219 |
+
autodoc_typehints = 'none'
|
| 220 |
+
|
| 221 |
+
# The not found page
|
| 222 |
+
notfound_template = '404.html'
|
opencompass/docs/en/docutils.conf
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[html writers]
|
| 2 |
+
table_style: colwidths-auto
|
opencompass/docs/en/get_started/faq.md
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FAQ
|
| 2 |
+
|
| 3 |
+
## General
|
| 4 |
+
|
| 5 |
+
### What are the differences and connections between `ppl` and `gen`?
|
| 6 |
+
|
| 7 |
+
`ppl` stands for perplexity, an index used to evaluate a model's language modeling capabilities. In the context of OpenCompass, it generally refers to a method of answering multiple-choice questions: given a context, the model needs to choose the most appropriate option from multiple choices. In this case, we concatenate the n options with the context to form n sequences, then calculate the model's perplexity for these n sequences. We consider the option corresponding to the sequence with the lowest perplexity as the model's reasoning result for this question. This evaluation method is simple and direct in post-processing, with high certainty.
|
| 8 |
+
|
| 9 |
+
`gen` is an abbreviation for generate. In the context of OpenCompass, it refers to the model's continuation writing result given a context as the reasoning result for a question. Generally, the string obtained from continuation writing requires a heavier post-processing process to extract reliable answers and complete the evaluation.
|
| 10 |
+
|
| 11 |
+
In terms of usage, multiple-choice questions and some multiple-choice-like questions of the base model use `ppl`, while the base model's multiple-selection and non-multiple-choice questions use `gen`. All questions of the chat model use `gen`, as many commercial API models do not expose the `ppl` interface. However, there are exceptions, such as when we want the base model to output the problem-solving process (e.g., Let's think step by step), we will also use `gen`, but the overall usage is as shown in the following table:
|
| 12 |
+
|
| 13 |
+
| | ppl | gen |
|
| 14 |
+
| ---------- | -------------- | -------------------- |
|
| 15 |
+
| Base Model | Only MCQ Tasks | Tasks Other Than MCQ |
|
| 16 |
+
| Chat Model | None | All Tasks |
|
| 17 |
+
|
| 18 |
+
Similar to `ppl`, conditional log probability (`clp`) calculates the probability of the next token given a context. It is also only applicable to multiple-choice questions, and the range of probability calculation is limited to the tokens corresponding to the option numbers. The option corresponding to the token with the highest probability is considered the model's reasoning result. Compared to `ppl`, `clp` calculation is more efficient, requiring only one inference, whereas `ppl` requires n inferences. However, the drawback is that `clp` is subject to the tokenizer. For example, the presence or absence of space symbols before and after an option can change the tokenizer's encoding result, leading to unreliable test results. Therefore, `clp` is rarely used in OpenCompass.
|
| 19 |
+
|
| 20 |
+
### How does OpenCompass control the number of shots in few-shot evaluations?
|
| 21 |
+
|
| 22 |
+
In the dataset configuration file, there is a retriever field indicating how to recall samples from the dataset as context examples. The most commonly used is `FixKRetriever`, which means using a fixed k samples, hence k-shot. There is also `ZeroRetriever`, which means not using any samples, which in most cases implies 0-shot.
|
| 23 |
+
|
| 24 |
+
On the other hand, in-context samples can also be directly specified in the dataset template. In this case, `ZeroRetriever` is also used, but the evaluation is not 0-shot and needs to be determined based on the specific template. Refer to [prompt](../prompt/prompt_template.md) for more details
|
| 25 |
+
|
| 26 |
+
### How does OpenCompass allocate GPUs?
|
| 27 |
+
|
| 28 |
+
OpenCompass processes evaluation requests using the unit termed as "task". Each task is an independent combination of model(s) and dataset(s). The GPU resources needed for a task are determined entirely by the model being evaluated, specifically by the `num_gpus` parameter.
|
| 29 |
+
|
| 30 |
+
During evaluation, OpenCompass deploys multiple workers to execute tasks in parallel. These workers continuously try to secure GPU resources and run tasks until they succeed. As a result, OpenCompass always strives to leverage all available GPU resources to their maximum capacity.
|
| 31 |
+
|
| 32 |
+
For instance, if you're using OpenCompass on a local machine equipped with 8 GPUs, and each task demands 4 GPUs, then by default, OpenCompass will employ all 8 GPUs to concurrently run 2 tasks. However, if you adjust the `--max-num-workers` setting to 1, then only one task will be processed at a time, utilizing just 4 GPUs.
|
| 33 |
+
|
| 34 |
+
### Why doesn't the GPU behavior of HuggingFace models align with my expectations?
|
| 35 |
+
|
| 36 |
+
This is a complex issue that needs to be explained from both the supply and demand sides:
|
| 37 |
+
|
| 38 |
+
The supply side refers to how many tasks are being run. A task is a combination of a model and a dataset, and it primarily depends on how many models and datasets need to be tested. Additionally, since OpenCompass splits a larger task into multiple smaller tasks, the number of data entries per sub-task (`--max-partition-size`) also affects the number of tasks. (The `--max-partition-size` is proportional to the actual number of data entries, but the relationship is not 1:1).
|
| 39 |
+
|
| 40 |
+
The demand side refers to how many workers are running. Since OpenCompass instantiates multiple models for inference simultaneously, we use `--hf-num-gpus` to specify how many GPUs each instance uses. Note that `--hf-num-gpus` is a parameter specific to HuggingFace models and setting this parameter for non-HuggingFace models will not have any effect. We also use `--max-num-workers` to indicate the maximum number of instances running at the same time. Lastly, due to issues like GPU memory and insufficient load, OpenCompass also supports running multiple instances on the same GPU, which is managed by the parameter `--max-num-workers-per-gpu`. Therefore, it can be generally assumed that we will use a total of `--hf-num-gpus` * `--max-num-workers` / `--max-num-workers-per-gpu` GPUs.
|
| 41 |
+
|
| 42 |
+
In summary, when tasks run slowly or the GPU load is low, we first need to check if the supply is sufficient. If not, consider reducing `--max-partition-size` to split the tasks into finer parts. Next, we need to check if the demand is sufficient. If not, consider increasing `--max-num-workers` and `--max-num-workers-per-gpu`. Generally, **we set `--hf-num-gpus` to the minimum value that meets the demand and do not adjust it further.**
|
| 43 |
+
|
| 44 |
+
### How do I control the number of GPUs that OpenCompass occupies?
|
| 45 |
+
|
| 46 |
+
Currently, there isn't a direct method to specify the number of GPUs OpenCompass can utilize. However, the following are some indirect strategies:
|
| 47 |
+
|
| 48 |
+
**If evaluating locally:**
|
| 49 |
+
You can limit OpenCompass's GPU access by setting the `CUDA_VISIBLE_DEVICES` environment variable. For instance, using `CUDA_VISIBLE_DEVICES=0,1,2,3 python run.py ...` will only expose the first four GPUs to OpenCompass, ensuring it uses no more than these four GPUs simultaneously.
|
| 50 |
+
|
| 51 |
+
**If using Slurm or DLC:**
|
| 52 |
+
Although OpenCompass doesn't have direct access to the resource pool, you can adjust the `--max-num-workers` parameter to restrict the number of evaluation tasks being submitted simultaneously. This will indirectly manage the number of GPUs that OpenCompass employs. For instance, if each task requires 4 GPUs, and you wish to allocate a total of 8 GPUs, then you should set `--max-num-workers` to 2.
|
| 53 |
+
|
| 54 |
+
### `libGL.so.1` not found
|
| 55 |
+
|
| 56 |
+
opencv-python depends on some dynamic libraries that are not present in the environment. The simplest solution is to uninstall opencv-python and then install opencv-python-headless.
|
| 57 |
+
|
| 58 |
+
```bash
|
| 59 |
+
pip uninstall opencv-python
|
| 60 |
+
pip install opencv-python-headless
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
Alternatively, you can install the corresponding dependency libraries according to the error message
|
| 64 |
+
|
| 65 |
+
```bash
|
| 66 |
+
sudo apt-get update
|
| 67 |
+
sudo apt-get install -y libgl1 libglib2.0-0
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
## Network
|
| 71 |
+
|
| 72 |
+
### My tasks failed with error: `('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))` or `urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='cdn-lfs.huggingface.co', port=443)`
|
| 73 |
+
|
| 74 |
+
Because of HuggingFace's implementation, OpenCompass requires network (especially the connection to HuggingFace) for the first time it loads some datasets and models. Additionally, it connects to HuggingFace each time it is launched. For a successful run, you may:
|
| 75 |
+
|
| 76 |
+
- Work behind a proxy by specifying the environment variables `http_proxy` and `https_proxy`;
|
| 77 |
+
- Use the cache files from other machines. You may first run the experiment on a machine that has access to the Internet, and then copy the cached files to the offline one. The cached files are located at `~/.cache/huggingface/` by default ([doc](https://huggingface.co/docs/datasets/cache#cache-directory)). When the cached files are ready, you can start the evaluation in offline mode:
|
| 78 |
+
```python
|
| 79 |
+
HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 HF_EVALUATE_OFFLINE=1 python run.py ...
|
| 80 |
+
```
|
| 81 |
+
With which no more network connection is needed for the evaluation. However, an error will still be raised if the files of any dataset or model are missing from the cache.
|
| 82 |
+
- Use mirror like [hf-mirror](https://hf-mirror.com/)
|
| 83 |
+
```python
|
| 84 |
+
HF_ENDPOINT=https://hf-mirror.com python run.py ...
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
### My server cannot connect to the Internet, how can I use OpenCompass?
|
| 88 |
+
|
| 89 |
+
Use the cache files from other machines, as suggested in the answer to [Network-Q1](#my-tasks-failed-with-error-connection-aborted-connectionreseterror104-connection-reset-by-peer-or-urllib3exceptionsmaxretryerror-httpsconnectionpoolhostcdn-lfshuggingfaceco-port443).
|
| 90 |
+
|
| 91 |
+
### In evaluation phase, I'm running into an error saying that `FileNotFoundError: Couldn't find a module script at opencompass/accuracy.py. Module 'accuracy' doesn't exist on the Hugging Face Hub either.`
|
| 92 |
+
|
| 93 |
+
HuggingFace tries to load the metric (e.g. `accuracy`) as an module online, and it could fail if the network is unreachable. Please refer to [Network-Q1](#my-tasks-failed-with-error-connection-aborted-connectionreseterror104-connection-reset-by-peer-or-urllib3exceptionsmaxretryerror-httpsconnectionpoolhostcdn-lfshuggingfaceco-port443) for guidelines to fix your network issue.
|
| 94 |
+
|
| 95 |
+
The issue has been fixed in the latest version of OpenCompass, so you might also consider pulling the latest version.
|
| 96 |
+
|
| 97 |
+
## Efficiency
|
| 98 |
+
|
| 99 |
+
### Why does OpenCompass partition each evaluation request into tasks?
|
| 100 |
+
|
| 101 |
+
Given the extensive evaluation time and the vast quantity of datasets, conducting a comprehensive linear evaluation on LLM models can be immensely time-consuming. To address this, OpenCompass divides the evaluation request into multiple independent "tasks". These tasks are then dispatched to various GPU groups or nodes, achieving full parallelism and maximizing the efficiency of computational resources.
|
| 102 |
+
|
| 103 |
+
### How does task partitioning work?
|
| 104 |
+
|
| 105 |
+
Each task in OpenCompass represents a combination of specific model(s) and portions of the dataset awaiting evaluation. OpenCompass offers a variety of task partitioning strategies, each tailored for different scenarios. During the inference stage, the prevalent partitioning method seeks to balance task size, or computational cost. This cost is heuristically derived from the dataset size and the type of inference.
|
| 106 |
+
|
| 107 |
+
### Why does it take more time to evaluate LLM models on OpenCompass?
|
| 108 |
+
|
| 109 |
+
There is a tradeoff between the number of tasks and the time to load the model. For example, if we partition a request that evaluates a model against a dataset into 100 tasks, the model will be loaded 100 times in total. When resources are abundant, these 100 tasks can be executed in parallel, so the additional time spent on model loading can be ignored. However, if resources are limited, these 100 tasks will operate more sequentially, and repeated loadings can become a bottleneck in execution time.
|
| 110 |
+
|
| 111 |
+
Hence, if users find that the number of tasks greatly exceeds the available GPUs, we advise setting the `--max-partition-size` to a larger value.
|
| 112 |
+
|
| 113 |
+
## Model
|
| 114 |
+
|
| 115 |
+
### How to use the downloaded huggingface models?
|
| 116 |
+
|
| 117 |
+
If you have already downloaded the checkpoints of the model, you can specify the local path of the model. For example
|
| 118 |
+
|
| 119 |
+
```bash
|
| 120 |
+
python run.py --datasets siqa_gen winograd_ppl --hf-type base --hf-path /path/to/model
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
## Dataset
|
| 124 |
+
|
| 125 |
+
### How to build a new dataset?
|
| 126 |
+
|
| 127 |
+
- For building new objective dataset: [new_dataset](../advanced_guides/new_dataset.md)
|
| 128 |
+
- For building new subjective dataset: [subjective_evaluation](../advanced_guides/subjective_evaluation.md)
|
opencompass/docs/en/get_started/installation.md
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Installation
|
| 2 |
+
|
| 3 |
+
## Basic Installation
|
| 4 |
+
|
| 5 |
+
1. Prepare the OpenCompass runtime environment using Conda:
|
| 6 |
+
|
| 7 |
+
```bash
conda create --name opencompass python=3.10 -y
|
| 8 |
+
# conda create --name opencompass_lmdeploy python=3.10 -y
|
| 9 |
+
|
| 10 |
+
conda activate opencompass
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
If you want to customize the PyTorch version or related CUDA version, please refer to the [official documentation](https://pytorch.org/get-started/locally/) to set up the PyTorch environment. Note that OpenCompass requires `pytorch>=1.13`.
|
| 14 |
+
|
| 15 |
+
2. Install OpenCompass:
|
| 16 |
+
- pip Installation
|
| 17 |
+
```bash
|
| 18 |
+
# For support of most datasets and models
|
| 19 |
+
pip install -U opencompass
|
| 20 |
+
|
| 21 |
+
# Complete installation (supports more datasets)
|
| 22 |
+
# pip install "opencompass[full]"
|
| 23 |
+
|
| 24 |
+
# API Testing (e.g., OpenAI, Qwen)
|
| 25 |
+
# pip install "opencompass[api]"
|
| 26 |
+
```
|
| 27 |
+
- Building from Source Code If you want to use the latest features of OpenCompass
|
| 28 |
+
```bash
|
| 29 |
+
git clone https://github.com/open-compass/opencompass opencompass
|
| 30 |
+
cd opencompass
|
| 31 |
+
pip install -e .
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
## Other Installations
|
| 35 |
+
|
| 36 |
+
### Inference Backends
|
| 37 |
+
|
| 38 |
+
```bash
|
| 39 |
+
# Model inference backends. Since these backends often have dependency conflicts,
|
| 40 |
+
# we recommend using separate virtual environments to manage them.
|
| 41 |
+
pip install "opencompass[lmdeploy]"
|
| 42 |
+
# pip install "opencompass[vllm]"
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
- LMDeploy
|
| 46 |
+
|
| 47 |
+
You can check if the inference backend has been installed successfully with the following command. For more information, refer to the [official documentation](https://lmdeploy.readthedocs.io/en/latest/get_started.html)
|
| 48 |
+
|
| 49 |
+
```bash
|
| 50 |
+
lmdeploy chat internlm/internlm2_5-1_8b-chat --backend turbomind
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
- vLLM
|
| 54 |
+
|
| 55 |
+
You can check if the inference backend has been installed successfully with the following command. For more information, refer to the [official documentation](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
|
| 56 |
+
|
| 57 |
+
```bash
|
| 58 |
+
vllm serve facebook/opt-125m
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
### API
|
| 62 |
+
|
| 63 |
+
OpenCompass supports different commercial model API calls, which you can install via pip or by referring to the [API dependencies](https://github.com/open-compass/opencompass/blob/main/requirements/api.txt) for specific API model dependencies.
|
| 64 |
+
|
| 65 |
+
```bash
|
| 66 |
+
pip install "opencompass[api]"
|
| 67 |
+
|
| 68 |
+
# pip install openai # GPT-3.5-Turbo / GPT-4-Turbo / GPT-4 / GPT-4o (API)
|
| 69 |
+
# pip install anthropic # Claude (API)
|
| 70 |
+
# pip install dashscope # Qwen (API)
|
| 71 |
+
# pip install volcengine-python-sdk # ByteDance Volcano Engine (API)
|
| 72 |
+
# ...
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
### Datasets
|
| 76 |
+
|
| 77 |
+
The basic installation supports most fundamental datasets. For certain datasets (e.g., Alpaca-eval, Longbench, etc.), additional dependencies need to be installed.
|
| 78 |
+
|
| 79 |
+
You can install these through pip or refer to the [additional dependencies](https://github.com/open-compass/opencompass/blob/main/requirements/extra.txt) for specific dependencies.
|
| 80 |
+
|
| 81 |
+
```bash
|
| 82 |
+
pip install "opencompass[full]"
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
For HumanEvalX / HumanEval+ / MBPP+, you need to manually clone the Git repository and install it.
|
| 86 |
+
|
| 87 |
+
```bash
|
| 88 |
+
git clone --recurse-submodules git@github.com:open-compass/human-eval.git
|
| 89 |
+
cd human-eval
|
| 90 |
+
pip install -e .
|
| 91 |
+
pip install -e evalplus
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
Some agent evaluations require installing numerous dependencies, which may conflict with existing runtime environments. We recommend creating separate conda environments to manage these.
|
| 95 |
+
|
| 96 |
+
```bash
|
| 97 |
+
# T-Eval
|
| 98 |
+
pip install lagent==0.1.2
|
| 99 |
+
# CIBench
|
| 100 |
+
pip install -r requirements/agent.txt
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
# Dataset Preparation
|
| 104 |
+
|
| 105 |
+
The datasets supported by OpenCompass mainly include three parts:
|
| 106 |
+
|
| 107 |
+
1. Huggingface datasets: The [Huggingface Datasets](https://huggingface.co/datasets) provide a large number of datasets, which will **automatically download** when running with this option.
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
2. ModelScope Datasets: [ModelScope OpenCompass Dataset](https://modelscope.cn/organization/opencompass) supports automatic downloading of datasets from ModelScope.
|
| 111 |
+
|
| 112 |
+
To enable this feature, set the environment variable: `export DATASET_SOURCE=ModelScope`. The available datasets include (sourced from OpenCompassData-core.zip):
|
| 113 |
+
|
| 114 |
+
```plain
|
| 115 |
+
humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ceval, math, LCSTS, Xsum, winogrande, openbookqa, AGIEval, gsm8k, nq, race, siqa, mbpp, mmlu, hellaswag, ARC, BBH, xstory_cloze, summedits, GAOKAO-BENCH, OCNLI, cmnli
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
3. Custom dataset: OpenCompass also provides some Chinese custom **self-built** datasets. Please run the following command to **manually download and extract** them.
|
| 119 |
+
|
| 120 |
+
Running the following commands to download and place the datasets in the `${OpenCompass}/data` directory will complete the dataset preparation.
|
| 121 |
+
|
| 122 |
+
```bash
|
| 123 |
+
# Run in the OpenCompass directory
|
| 124 |
+
wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip
|
| 125 |
+
unzip OpenCompassData-core-20240207.zip
|
| 126 |
+
```
|
| 127 |
+
|
| 128 |
+
If you need to use the more comprehensive dataset (~500M) provided by OpenCompass, You can download and `unzip` it using the following command:
|
| 129 |
+
|
| 130 |
+
```bash
|
| 131 |
+
wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-complete-20240207.zip
|
| 132 |
+
unzip OpenCompassData-complete-20240207.zip
|
| 133 |
+
cd ./data
|
| 134 |
+
find . -name "*.zip" -exec unzip "{}" \;
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
The list of datasets included in both `.zip` can be found [here](https://github.com/open-compass/opencompass/releases/tag/0.2.2.rc1)
|
| 138 |
+
|
| 139 |
+
OpenCompass has supported most of the datasets commonly used for performance comparison, please refer to `configs/dataset` for the specific list of supported datasets.
|
| 140 |
+
|
| 141 |
+
For next step, please read [Quick Start](./quick_start.md).
|
opencompass/docs/en/get_started/quick_start.md
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Quick Start
|
| 2 |
+
|
| 3 |
+

|
| 4 |
+
|
| 5 |
+
## Overview
|
| 6 |
+
|
| 7 |
+
OpenCompass provides a streamlined workflow for evaluating a model, which consists of the following stages: **Configure** -> **Inference** -> **Evaluation** -> **Visualization**.
|
| 8 |
+
|
| 9 |
+
**Configure**: This is your starting point. Here, you'll set up the entire evaluation process, choosing the model(s) and dataset(s) to assess. You also have the option to select an evaluation strategy, the computation backend, and define how you'd like the results displayed.
|
| 10 |
+
|
| 11 |
+
**Inference & Evaluation**: OpenCompass efficiently manages the heavy lifting, conducting parallel inference and evaluation on your chosen model(s) and dataset(s). The **Inference** phase is all about producing outputs from your datasets, whereas the **Evaluation** phase measures how well these outputs align with the gold standard answers. While this procedure is broken down into multiple "tasks" that run concurrently for greater efficiency, be aware that working with limited computational resources might introduce some unexpected overheads, resulting in generally slower evaluation. To understand this issue and know how to solve it, check out [FAQ: Efficiency](faq.md#efficiency).
|
| 12 |
+
|
| 13 |
+
**Visualization**: Once the evaluation is done, OpenCompass collates the results into an easy-to-read table and saves them as both CSV and TXT files. If you need real-time updates, you can activate lark reporting and get immediate status reports in your Lark clients.
|
| 14 |
+
|
| 15 |
+
Coming up, we'll walk you through the basics of OpenCompass, showcasing evaluations of pretrained models [OPT-125M](https://huggingface.co/facebook/opt-125m) and [OPT-350M](https://huggingface.co/facebook/opt-350m) on the [SIQA](https://huggingface.co/datasets/social_i_qa) and [Winograd](https://huggingface.co/datasets/winograd_wsc) benchmark tasks. Their configuration files can be found at [configs/eval_demo.py](https://github.com/open-compass/opencompass/blob/main/configs/eval_demo.py).
|
| 16 |
+
|
| 17 |
+
Before running this experiment, please make sure you have installed OpenCompass locally and it should run successfully under one _GTX-1660-6G_ GPU.
|
| 18 |
+
For larger parameterized models like Llama-7B, refer to other examples provided in the [configs directory](https://github.com/open-compass/opencompass/tree/main/configs).
|
| 19 |
+
|
| 20 |
+
## Configuring an Evaluation Task
|
| 21 |
+
|
| 22 |
+
In OpenCompass, each evaluation task consists of the model to be evaluated and the dataset. The entry point for evaluation is `run.py`. Users can select the model and dataset to be tested either via command line or configuration files.
|
| 23 |
+
|
| 24 |
+
`````{tabs}
|
| 25 |
+
````{tab} Command Line (Custom HF Model)
|
| 26 |
+
|
| 27 |
+
For HuggingFace models, users can set model parameters directly through the command line without additional configuration files. For instance, for the `facebook/opt-125m` model, you can evaluate it with the following command:
|
| 28 |
+
|
| 29 |
+
```bash
|
| 30 |
+
python run.py --datasets siqa_gen winograd_ppl \
|
| 31 |
+
--hf-type base \
|
| 32 |
+
--hf-path facebook/opt-125m
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
Note that in this way, OpenCompass only evaluates one model at a time, while other ways can evaluate multiple models at once.
|
| 36 |
+
|
| 37 |
+
```{caution}
|
| 38 |
+
`--hf-num-gpus` does not stand for the actual number of GPUs to use in evaluation, but the minimum required number of GPUs for this model. [More](faq.md#how-does-opencompass-allocate-gpus)
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
:::{dropdown} More detailed example
|
| 42 |
+
:animate: fade-in-slide-down
|
| 43 |
+
```bash
|
| 44 |
+
python run.py --datasets siqa_gen winograd_ppl \
|
| 45 |
+
--hf-type base \ # HuggingFace model type, base or chat
|
| 46 |
+
--hf-path facebook/opt-125m \ # HuggingFace model path
|
| 47 |
+
--tokenizer-path facebook/opt-125m \ # HuggingFace tokenizer path (if the same as the model path, can be omitted)
|
| 48 |
+
--tokenizer-kwargs padding_side='left' truncation='left' trust_remote_code=True \ # Arguments to construct the tokenizer
|
| 49 |
+
--model-kwargs device_map='auto' \ # Arguments to construct the model
|
| 50 |
+
--max-seq-len 2048 \ # Maximum sequence length the model can accept
|
| 51 |
+
--max-out-len 100 \ # Maximum number of tokens to generate
|
| 52 |
+
--min-out-len 100 \ # Minimum number of tokens to generate
|
| 53 |
+
--batch-size 64 \ # Batch size
|
| 54 |
+
--hf-num-gpus 1 # Number of GPUs required to run the model
|
| 55 |
+
```
|
| 56 |
+
```{seealso}
|
| 57 |
+
For all HuggingFace related parameters supported by `run.py`, please read [Launching Evaluation Task](../user_guides/experimentation.md#launching-an-evaluation-task).
|
| 58 |
+
```
|
| 59 |
+
:::
|
| 60 |
+
|
| 61 |
+
````
|
| 62 |
+
````{tab} Command Line
|
| 63 |
+
|
| 64 |
+
Users can combine the models and datasets they want to test using `--models` and `--datasets`.
|
| 65 |
+
|
| 66 |
+
```bash
|
| 67 |
+
python run.py --models hf_opt_125m hf_opt_350m --datasets siqa_gen winograd_ppl
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
The models and datasets are pre-stored in the form of configuration files in `configs/models` and `configs/datasets`. Users can view or filter the currently available model and dataset configurations using `tools/list_configs.py`.
|
| 71 |
+
|
| 72 |
+
```bash
|
| 73 |
+
# List all configurations
|
| 74 |
+
python tools/list_configs.py
|
| 75 |
+
# List all configurations related to llama and mmlu
|
| 76 |
+
python tools/list_configs.py llama mmlu
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
:::{dropdown} More about `list_configs`
|
| 80 |
+
:animate: fade-in-slide-down
|
| 81 |
+
|
| 82 |
+
Running `python tools/list_configs.py llama mmlu` gives the output like:
|
| 83 |
+
|
| 84 |
+
```text
|
| 85 |
+
+-----------------+-----------------------------------+
|
| 86 |
+
| Model | Config Path |
|
| 87 |
+
|-----------------+-----------------------------------|
|
| 88 |
+
| hf_llama2_13b | configs/models/hf_llama2_13b.py |
|
| 89 |
+
| hf_llama2_70b | configs/models/hf_llama2_70b.py |
|
| 90 |
+
| ... | ... |
|
| 91 |
+
+-----------------+-----------------------------------+
|
| 92 |
+
+-------------------+---------------------------------------------------+
|
| 93 |
+
| Dataset | Config Path |
|
| 94 |
+
|-------------------+---------------------------------------------------|
|
| 95 |
+
| cmmlu_gen | configs/datasets/cmmlu/cmmlu_gen.py |
|
| 96 |
+
| cmmlu_gen_ffe7c0 | configs/datasets/cmmlu/cmmlu_gen_ffe7c0.py |
|
| 97 |
+
| ... | ... |
|
| 98 |
+
+-------------------+---------------------------------------------------+
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
Users can use the names in the first column as input parameters for `--models` and `--datasets` in `python run.py`. For datasets, the same name with different suffixes generally indicates that its prompts or evaluation methods are different.
|
| 102 |
+
:::
|
| 103 |
+
|
| 104 |
+
:::{dropdown} Model not on the list?
|
| 105 |
+
:animate: fade-in-slide-down
|
| 106 |
+
|
| 107 |
+
If you want to evaluate other models, please check out the "Command Line (Custom HF Model)" tab for the way to construct a custom HF model without a configuration file, or "Configuration File" tab to learn the general way to prepare your model configurations.
|
| 108 |
+
|
| 109 |
+
:::
|
| 110 |
+
|
| 111 |
+
````
|
| 112 |
+
|
| 113 |
+
````{tab} Configuration File
|
| 114 |
+
|
| 115 |
+
In addition to configuring the experiment through the command line, OpenCompass also allows users to write the full configuration of the experiment in a configuration file and run it directly through `run.py`. The configuration file is organized in Python format and must include the `datasets` and `models` fields.
|
| 116 |
+
|
| 117 |
+
The test configuration for this time is [configs/eval_demo.py](https://github.com/open-compass/opencompass/blob/main/configs/eval_demo.py). This configuration introduces the required dataset and model configurations through the [inheritance mechanism](../user_guides/config.md#inheritance-mechanism) and combines the `datasets` and `models` fields in the required format.
|
| 118 |
+
|
| 119 |
+
```python
|
| 120 |
+
from mmengine.config import read_base
|
| 121 |
+
|
| 122 |
+
with read_base():
|
| 123 |
+
from .datasets.siqa.siqa_gen import siqa_datasets
|
| 124 |
+
from .datasets.winograd.winograd_ppl import winograd_datasets
|
| 125 |
+
from .models.opt.hf_opt_125m import opt125m
|
| 126 |
+
from .models.opt.hf_opt_350m import opt350m
|
| 127 |
+
|
| 128 |
+
datasets = [*siqa_datasets, *winograd_datasets]
|
| 129 |
+
models = [opt125m, opt350m]
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
When running tasks, we just need to pass the path of the configuration file to `run.py`:
|
| 133 |
+
|
| 134 |
+
```bash
|
| 135 |
+
python run.py configs/eval_demo.py
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
:::{dropdown} More about `models`
|
| 139 |
+
:animate: fade-in-slide-down
|
| 140 |
+
|
| 141 |
+
OpenCompass provides a series of pre-defined model configurations under `configs/models`. Below is the configuration snippet related to [opt-350m](https://github.com/open-compass/opencompass/blob/main/configs/models/opt/hf_opt_350m.py) (`configs/models/opt/hf_opt_350m.py`):
|
| 142 |
+
|
| 143 |
+
```python
|
| 144 |
+
# Evaluate models supported by HuggingFace's `AutoModelForCausalLM` using `HuggingFaceBaseModel`
|
| 145 |
+
from opencompass.models import HuggingFaceBaseModel
|
| 146 |
+
|
| 147 |
+
models = [
|
| 148 |
+
# OPT-350M
|
| 149 |
+
dict(
|
| 150 |
+
type=HuggingFaceBaseModel,
|
| 151 |
+
# Initialization parameters for `HuggingFaceBaseModel`
|
| 152 |
+
path='facebook/opt-350m',
|
| 153 |
+
# Below are common parameters for all models, not specific to HuggingFaceBaseModel
|
| 154 |
+
abbr='opt-350m-hf', # Model abbreviation
|
| 155 |
+
max_out_len=1024, # Maximum number of generated tokens
|
| 156 |
+
batch_size=32, # Batch size
|
| 157 |
+
run_cfg=dict(num_gpus=1), # The required GPU numbers for this model
|
| 158 |
+
)
|
| 159 |
+
]
|
| 160 |
+
```
|
| 161 |
+
|
| 162 |
+
When using configurations, we can specify the relevant files through the command-line argument `--models` or import the model configurations into the `models` list in the configuration file using the inheritance mechanism.
|
| 163 |
+
|
| 164 |
+
```{seealso}
|
| 165 |
+
More information about model configuration can be found in [Prepare Models](../user_guides/models.md).
|
| 166 |
+
```
|
| 167 |
+
:::
|
| 168 |
+
|
| 169 |
+
:::{dropdown} More about `datasets`
|
| 170 |
+
:animate: fade-in-slide-down
|
| 171 |
+
|
| 172 |
+
Similar to models, dataset configuration files are provided under `configs/datasets`. Users can use `--datasets` in the command line or import related configurations in the configuration file via inheritance
|
| 173 |
+
|
| 174 |
+
Below is a dataset-related configuration snippet from `configs/eval_demo.py`:
|
| 175 |
+
|
| 176 |
+
```python
|
| 177 |
+
from mmengine.config import read_base # Use mmengine.read_base() to read the base configuration
|
| 178 |
+
|
| 179 |
+
with read_base():
|
| 180 |
+
# Directly read the required dataset configurations from the preset dataset configurations
|
| 181 |
+
from .datasets.winograd.winograd_ppl import winograd_datasets # Read Winograd configuration, evaluated based on PPL (perplexity)
|
| 182 |
+
from .datasets.siqa.siqa_gen import siqa_datasets # Read SIQA configuration, evaluated based on generation
|
| 183 |
+
|
| 184 |
+
datasets = [*siqa_datasets, *winograd_datasets] # The final config needs to contain the required evaluation dataset list 'datasets'
|
| 185 |
+
```
|
| 186 |
+
|
| 187 |
+
Dataset configurations are typically of two types: 'ppl' and 'gen', indicating the evaluation method used. Where `ppl` means discriminative evaluation and `gen` means generative evaluation.
|
| 188 |
+
|
| 189 |
+
Moreover, [configs/datasets/collections](https://github.com/open-compass/opencompass/blob/main/configs/datasets/collections) houses various dataset collections, making it convenient for comprehensive evaluations. OpenCompass often uses [`base_medium.py`](/configs/datasets/collections/base_medium.py) for full-scale model testing. To replicate results, simply import that file, for example:
|
| 190 |
+
|
| 191 |
+
```bash
|
| 192 |
+
python run.py --models hf_llama_7b --datasets base_medium
|
| 193 |
+
```
|
| 194 |
+
|
| 195 |
+
```{seealso}
|
| 196 |
+
You can find more information from [Dataset Preparation](../user_guides/datasets.md).
|
| 197 |
+
```
|
| 198 |
+
:::
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
````
|
| 202 |
+
|
| 203 |
+
`````
|
| 204 |
+
|
| 205 |
+
```{warning}
|
| 206 |
+
OpenCompass usually assumes network is available. If you encounter network issues or wish to run OpenCompass in an offline environment, please refer to [FAQ - Network - Q1](./faq.md#network) for solutions.
|
| 207 |
+
```
|
| 208 |
+
|
| 209 |
+
The following sections will use configuration-based method as an example to explain the other features.
|
| 210 |
+
|
| 211 |
+
## Launching Evaluation
|
| 212 |
+
|
| 213 |
+
Since OpenCompass launches evaluation processes in parallel by default, we can start the evaluation in `--debug` mode for the first run and check if there is any problem. In `--debug` mode, the tasks will be executed sequentially and output will be printed in real time.
|
| 214 |
+
|
| 215 |
+
```bash
|
| 216 |
+
python run.py configs/eval_demo.py -w outputs/demo --debug
|
| 217 |
+
```
|
| 218 |
+
|
| 219 |
+
The pretrained models 'facebook/opt-350m' and 'facebook/opt-125m' will be automatically downloaded from HuggingFace during the first run.
|
| 220 |
+
If everything is fine, you should see "Starting inference process" on screen:
|
| 221 |
+
|
| 222 |
+
```bash
|
| 223 |
+
[2023-07-12 18:23:55,076] [opencompass.openicl.icl_inferencer.icl_gen_inferencer] [INFO] Starting inference process...
|
| 224 |
+
```
|
| 225 |
+
|
| 226 |
+
Then you can press `ctrl+c` to interrupt the program, and run the following command in normal mode:
|
| 227 |
+
|
| 228 |
+
```bash
|
| 229 |
+
python run.py configs/eval_demo.py -w outputs/demo
|
| 230 |
+
```
|
| 231 |
+
|
| 232 |
+
In normal mode, the evaluation tasks will be executed in parallel in the background, and their output will be redirected to the output directory `outputs/demo/{TIMESTAMP}`. The progress bar on the frontend only indicates the number of completed tasks, regardless of their success or failure. **Any backend task failures will only trigger a warning message in the terminal.**
|
| 233 |
+
|
| 234 |
+
:::{dropdown} More parameters in `run.py`
|
| 235 |
+
:animate: fade-in-slide-down
|
| 236 |
+
Here are some parameters related to evaluation that can help you configure more efficient inference tasks based on your environment:
|
| 237 |
+
|
| 238 |
+
- `-w outputs/demo`: Work directory to save evaluation logs and results. In this case, the experiment result will be saved to `outputs/demo/{TIMESTAMP}`.
|
| 239 |
+
- `-r`: Reuse existing inference results, and skip the finished tasks. If followed by a timestamp, the result under that timestamp in the workspace path will be reused; otherwise, the latest result in the specified workspace path will be reused.
|
| 240 |
+
- `--mode all`: Specify a specific stage of the task.
|
| 241 |
+
- all: (Default) Perform a complete evaluation, including inference and evaluation.
|
| 242 |
+
- infer: Perform inference on each dataset.
|
| 243 |
+
- eval: Perform evaluation based on the inference results.
|
| 244 |
+
- viz: Display evaluation results only.
|
| 245 |
+
- `--max-partition-size 2000`: Dataset partition size. Some datasets may be large, and using this parameter can split them into multiple sub-tasks to efficiently utilize resources. However, if the partition is too fine, the overall speed may be slower due to longer model loading times.
|
| 246 |
+
- `--max-num-workers 32`: Maximum number of parallel tasks. In distributed environments such as Slurm, this parameter specifies the maximum number of submitted tasks. In a local environment, it specifies the maximum number of tasks executed in parallel. Note that the actual number of parallel tasks depends on the available GPU resources and may not be equal to this number.
|
| 247 |
+
|
| 248 |
+
If you are not performing the evaluation on your local machine but using a Slurm cluster, you can specify the following parameters:
|
| 249 |
+
|
| 250 |
+
- `--slurm`: Submit tasks using Slurm on the cluster.
|
| 251 |
+
- `--partition(-p) my_part`: Slurm cluster partition.
|
| 252 |
+
- `--retry 2`: Number of retries for failed tasks.
|
| 253 |
+
|
| 254 |
+
```{seealso}
|
| 255 |
+
The entry also supports submitting tasks to Alibaba Deep Learning Center (DLC), and more customized evaluation strategies. Please refer to [Launching an Evaluation Task](../user_guides/experimentation.md#launching-an-evaluation-task) for details.
|
| 256 |
+
```
|
| 257 |
+
|
| 258 |
+
:::
|
| 259 |
+
|
| 260 |
+
## Visualizing Evaluation Results
|
| 261 |
+
|
| 262 |
+
After the evaluation is complete, the evaluation results table will be printed as follows:
|
| 263 |
+
|
| 264 |
+
```text
|
| 265 |
+
dataset version metric mode opt350m opt125m
|
| 266 |
+
--------- --------- -------- ------ --------- ---------
|
| 267 |
+
siqa e78df3 accuracy gen 21.55 12.44
|
| 268 |
+
winograd b6c7ed accuracy ppl 51.23 49.82
|
| 269 |
+
```
|
| 270 |
+
|
| 271 |
+
All run outputs will be directed to `outputs/demo/` directory with following structure:
|
| 272 |
+
|
| 273 |
+
```text
|
| 274 |
+
outputs/default/
|
| 275 |
+
├── 20200220_120000
|
| 276 |
+
├── 20230220_183030 # one experiment per folder
|
| 277 |
+
│ ├── configs # Dumped config files for record. Multiple configs may be kept if different experiments have been re-run on the same experiment folder
|
| 278 |
+
│ ├── logs # log files for both inference and evaluation stages
|
| 279 |
+
│ │ ├── eval
|
| 280 |
+
│ │ └── infer
|
| 281 |
+
│ ├── predictions # Prediction results for each task
|
| 282 |
+
│ ├── results # Evaluation results for each task
|
| 283 |
+
│ └── summary # Summarized evaluation results for a single experiment
|
| 284 |
+
├── ...
|
| 285 |
+
```
|
| 286 |
+
|
| 287 |
+
The summarization process can be further customized in configuration and output the averaged score of some benchmarks (MMLU, C-Eval, etc.).
|
| 288 |
+
|
| 289 |
+
More information about obtaining evaluation results can be found in [Results Summary](../user_guides/summarizer.md).
|
| 290 |
+
|
| 291 |
+
## Additional Tutorials
|
| 292 |
+
|
| 293 |
+
To learn more about using OpenCompass, explore the following tutorials:
|
| 294 |
+
|
| 295 |
+
- [Prepare Datasets](../user_guides/datasets.md)
|
| 296 |
+
- [Prepare Models](../user_guides/models.md)
|
| 297 |
+
- [Task Execution and Monitoring](../user_guides/experimentation.md)
|
| 298 |
+
- [Understand Prompts](../prompt/overview.md)
|
| 299 |
+
- [Results Summary](../user_guides/summarizer.md)
|
| 300 |
+
- [Learn about Config](../user_guides/config.md)
|
opencompass/docs/en/index.rst
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Welcome to OpenCompass' documentation!
|
| 2 |
+
==========================================
|
| 3 |
+
|
| 4 |
+
Getting started with OpenCompass
|
| 5 |
+
-------------------------------
|
| 6 |
+
|
| 7 |
+
To help you quickly become familiar with OpenCompass, we recommend you walk through the following documents in order:
|
| 8 |
+
|
| 9 |
+
- First read the GetStarted_ section to set up the environment, and run a mini experiment.
|
| 10 |
+
|
| 11 |
+
- Then learn its basic usage through the UserGuides_.
|
| 12 |
+
|
| 13 |
+
- If you want to tune the prompts, refer to the Prompt_.
|
| 14 |
+
|
| 15 |
+
- If you want to customize some modules, like adding a new dataset or model, we have provided the AdvancedGuides_.
|
| 16 |
+
|
| 17 |
+
- There are more handy tools, such as prompt viewer and lark bot reporter, all presented in Tools_.
|
| 18 |
+
|
| 19 |
+
We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
|
| 20 |
+
|
| 21 |
+
.. _GetStarted:
|
| 22 |
+
.. toctree::
|
| 23 |
+
:maxdepth: 1
|
| 24 |
+
:caption: Get Started
|
| 25 |
+
|
| 26 |
+
get_started/installation.md
|
| 27 |
+
get_started/quick_start.md
|
| 28 |
+
get_started/faq.md
|
| 29 |
+
|
| 30 |
+
.. _UserGuides:
|
| 31 |
+
.. toctree::
|
| 32 |
+
:maxdepth: 1
|
| 33 |
+
:caption: User Guides
|
| 34 |
+
|
| 35 |
+
user_guides/framework_overview.md
|
| 36 |
+
user_guides/config.md
|
| 37 |
+
user_guides/datasets.md
|
| 38 |
+
user_guides/models.md
|
| 39 |
+
user_guides/evaluation.md
|
| 40 |
+
user_guides/experimentation.md
|
| 41 |
+
user_guides/metrics.md
|
| 42 |
+
user_guides/summarizer.md
|
| 43 |
+
user_guides/corebench.md
|
| 44 |
+
|
| 45 |
+
.. _Prompt:
|
| 46 |
+
.. toctree::
|
| 47 |
+
:maxdepth: 1
|
| 48 |
+
:caption: Prompt
|
| 49 |
+
|
| 50 |
+
prompt/overview.md
|
| 51 |
+
prompt/prompt_template.md
|
| 52 |
+
prompt/meta_template.md
|
| 53 |
+
prompt/chain_of_thought.md
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
.. _AdvancedGuides:
|
| 57 |
+
.. toctree::
|
| 58 |
+
:maxdepth: 1
|
| 59 |
+
:caption: Advanced Guides
|
| 60 |
+
|
| 61 |
+
advanced_guides/new_dataset.md
|
| 62 |
+
advanced_guides/custom_dataset.md
|
| 63 |
+
advanced_guides/new_model.md
|
| 64 |
+
advanced_guides/evaluation_lmdeploy.md
|
| 65 |
+
advanced_guides/evaluation_lightllm.md
|
| 66 |
+
advanced_guides/accelerator_intro.md
|
| 67 |
+
advanced_guides/code_eval.md
|
| 68 |
+
advanced_guides/code_eval_service.md
|
| 69 |
+
advanced_guides/prompt_attack.md
|
| 70 |
+
advanced_guides/longeval.md
|
| 71 |
+
advanced_guides/subjective_evaluation.md
|
| 72 |
+
advanced_guides/circular_eval.md
|
| 73 |
+
advanced_guides/contamination_eval.md
|
| 74 |
+
advanced_guides/needleinahaystack_eval.md
|
| 75 |
+
|
| 76 |
+
.. _Tools:
|
| 77 |
+
.. toctree::
|
| 78 |
+
:maxdepth: 1
|
| 79 |
+
:caption: Tools
|
| 80 |
+
|
| 81 |
+
tools.md
|
| 82 |
+
|
| 83 |
+
.. _Notes:
|
| 84 |
+
.. toctree::
|
| 85 |
+
:maxdepth: 1
|
| 86 |
+
:caption: Notes
|
| 87 |
+
|
| 88 |
+
notes/contribution_guide.md
|
| 89 |
+
|
| 90 |
+
Indexes & Tables
|
| 91 |
+
==================
|
| 92 |
+
|
| 93 |
+
* :ref:`genindex`
|
| 94 |
+
* :ref:`search`
|
opencompass/docs/en/prompt/meta_template.md
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Meta Template
|
| 2 |
+
|
| 3 |
+
## Background
|
| 4 |
+
|
| 5 |
+
In the supervised fine-tuning (SFT) process of large language models (LLMs), we often inject some predefined strings into the conversation according to actual requirements, in order to prompt the model to output content according to certain guidelines. For example, in some `chat` model fine-tuning, we may add system-level instructions at the beginning of each dialogue, and establish a format to represent the conversation between the user and the model. In a conversation, the model may expect the text format to be as follows:
|
| 6 |
+
|
| 7 |
+
```bash
|
| 8 |
+
Meta instruction: You are now a helpful and harmless AI assistant.
|
| 9 |
+
HUMAN: Hi!<eoh>\n
|
| 10 |
+
Bot: Hello! How may I assist you?<eob>\n
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
During evaluation, we also need to enter questions according to the agreed format for the model to perform its best.
|
| 14 |
+
|
| 15 |
+
In addition, similar situations exist in API models. General API dialogue models allow users to pass in historical dialogues when calling, and some models also allow the input of SYSTEM level instructions. To better evaluate the ability of API models, we hope to make the data as close as possible to the multi-round dialogue template of the API model itself during the evaluation, rather than stuffing all the content into an instruction.
|
| 16 |
+
|
| 17 |
+
Therefore, we need to specify different parsing templates for different models. In OpenCompass, we call this set of parsing templates **Meta Template**. Meta Template is tied to the model's configuration and is combined with the dialogue template of the dataset during runtime to ultimately generate the most suitable prompt for the current model.
|
| 18 |
+
|
| 19 |
+
```python
|
| 20 |
+
# When specifying, just pass the meta_template field into the model
|
| 21 |
+
models = [
|
| 22 |
+
dict(
|
| 23 |
+
type='AnyModel',
|
| 24 |
+
meta_template = ..., # meta template
|
| 25 |
+
)
|
| 26 |
+
]
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
Next, we will introduce how to configure Meta Template on two types of models.
|
| 30 |
+
You are recommended to read [here](./prompt_template.md#dialogue-prompt) for the basic syntax of the dialogue template before reading this chapter.
|
| 31 |
+
|
| 32 |
+
```{note}
|
| 33 |
+
In some cases (such as testing the base model), we don't need to inject any instructions into the normal dialogue, in which case we can leave the meta template empty. In this case, the prompt received by the model is defined only by the dataset configuration and is a regular string. If the dataset configuration uses a dialogue template, speeches from different roles will be concatenated with \n.
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
## Application on Language Models
|
| 37 |
+
|
| 38 |
+
The following figure shows several situations where the data is built into a prompt through the prompt template and meta template from the dataset in the case of 2-shot learning. Readers can use this figure as a reference to help understand the following sections.
|
| 39 |
+
|
| 40 |
+

|
| 41 |
+
|
| 42 |
+
We will explain how to define the meta template with several examples.
|
| 43 |
+
|
| 44 |
+
Suppose that according to the dialogue template of the dataset, the following dialogue was produced:
|
| 45 |
+
|
| 46 |
+
```python
|
| 47 |
+
PromptList([
|
| 48 |
+
dict(role='HUMAN', prompt='1+1=?'),
|
| 49 |
+
dict(role='BOT', prompt='2'),
|
| 50 |
+
dict(role='HUMAN', prompt='2+2=?'),
|
| 51 |
+
dict(role='BOT', prompt='4'),
|
| 52 |
+
])
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
We want to pass this dialogue to a model that has already gone through SFT. The model's agreed dialogue begins with the speech of different roles with `<Role Name>:` and ends with a special token and \\n. Here is the complete string the model expects to receive:
|
| 56 |
+
|
| 57 |
+
```Plain
|
| 58 |
+
<HUMAN>: 1+1=?<eoh>
|
| 59 |
+
<BOT>: 2<eob>
|
| 60 |
+
<HUMAN>: 2+2=?<eoh>
|
| 61 |
+
<BOT>: 4<eob>
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
In the meta template, we only need to abstract the format of each round of dialogue into the following configuration:
|
| 65 |
+
|
| 66 |
+
```python
|
| 67 |
+
# model meta template
|
| 68 |
+
meta_template = dict(
|
| 69 |
+
round=[
|
| 70 |
+
dict(role='HUMAN', begin='<HUMAN>: ', end='<eoh>\n'),
|
| 71 |
+
dict(role='BOT', begin='<BOT>: ', end='<eob>\n'),
|
| 72 |
+
],
|
| 73 |
+
)
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
______________________________________________________________________
|
| 77 |
+
|
| 78 |
+
Some datasets may introduce SYSTEM-level roles:
|
| 79 |
+
|
| 80 |
+
```python
|
| 81 |
+
PromptList([
|
| 82 |
+
dict(role='SYSTEM', fallback_role='HUMAN', prompt='Solve the following math questions'),
|
| 83 |
+
dict(role='HUMAN', prompt='1+1=?'),
|
| 84 |
+
dict(role='BOT', prompt='2'),
|
| 85 |
+
dict(role='HUMAN', prompt='2+2=?'),
|
| 86 |
+
dict(role='BOT', prompt='4'),
|
| 87 |
+
])
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
Assuming the model also accepts the SYSTEM role, and expects the input to be:
|
| 91 |
+
|
| 92 |
+
```
|
| 93 |
+
<SYSTEM>: Solve the following math questions<eosys>\n
|
| 94 |
+
<HUMAN>: 1+1=?<eoh>\n
|
| 95 |
+
<BOT>: 2<eob>\n
|
| 96 |
+
<HUMAN>: 2+2=?<eoh>\n
|
| 97 |
+
<BOT>: 4<eob>\n
|
| 98 |
+
end of conversation
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
We can put the definition of the SYSTEM role into `reserved_roles`. Roles in `reserved_roles` will not appear in regular conversations, but they allow the dialogue template of the dataset configuration to call them in `begin` or `end`.
|
| 102 |
+
|
| 103 |
+
```python
|
| 104 |
+
# model meta template
|
| 105 |
+
meta_template = dict(
|
| 106 |
+
round=[
|
| 107 |
+
dict(role='HUMAN', begin='<HUMAN>: ', end='<eoh>\n'),
|
| 108 |
+
dict(role='BOT', begin='<BOT>: ', end='<eob>\n'),
|
| 109 |
+
],
|
| 110 |
+
reserved_roles=[dict(role='SYSTEM', begin='<SYSTEM>: ', end='<eosys>\n'),],
|
| 111 |
+
),
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
If the model does not accept the SYSTEM role, it is not necessary to configure this item, and it can still run normally. In this case, the string received by the model becomes:
|
| 115 |
+
|
| 116 |
+
```
|
| 117 |
+
<HUMAN>: Solve the following math questions<eoh>\n
|
| 118 |
+
<HUMAN>: 1+1=?<eoh>\n
|
| 119 |
+
<BOT>: 2<eob>\n
|
| 120 |
+
<HUMAN>: 2+2=?<eoh>\n
|
| 121 |
+
<BOT>: 4<eob>\n
|
| 122 |
+
end of conversation
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
This is because in the predefined datasets in OpenCompass, each `SYSTEM` speech has a `fallback_role='HUMAN'`, that is, if the `SYSTEM` role in the meta template does not exist, the speaker will be switched to the `HUMAN` role.
|
| 126 |
+
|
| 127 |
+
______________________________________________________________________
|
| 128 |
+
|
| 129 |
+
Some models may need to consider embedding other strings at the beginning or end of the conversation, such as system instructions:
|
| 130 |
+
|
| 131 |
+
```
|
| 132 |
+
Meta instruction: You are now a helpful and harmless AI assistant.
|
| 133 |
+
<SYSTEM>: Solve the following math questions<eosys>\n
|
| 134 |
+
<HUMAN>: 1+1=?<eoh>\n
|
| 135 |
+
<BOT>: 2<eob>\n
|
| 136 |
+
<HUMAN>: 2+2=?<eoh>\n
|
| 137 |
+
<BOT>: 4<eob>\n
|
| 138 |
+
end of conversation
|
| 139 |
+
```
|
| 140 |
+
|
| 141 |
+
In this case, we can specify these strings by specifying the begin and end parameters.
|
| 142 |
+
|
| 143 |
+
```python
|
| 144 |
+
meta_template = dict(
|
| 145 |
+
round=[
|
| 146 |
+
dict(role='HUMAN', begin='<HUMAN>: ', end='<eoh>\n'),
|
| 147 |
+
dict(role='BOT', begin='<BOT>: ', end='<eob>\n'),
|
| 148 |
+
],
|
| 149 |
+
reserved_roles=[dict(role='SYSTEM', begin='<SYSTEM>: ', end='<eosys>\n'),],
|
| 150 |
+
begin="Meta instruction: You are now a helpful and harmless AI assistant.",
|
| 151 |
+
end="end of conversation",
|
| 152 |
+
),
|
| 153 |
+
```
|
| 154 |
+
|
| 155 |
+
______________________________________________________________________
|
| 156 |
+
|
| 157 |
+
In **generative** task evaluation, we will not directly input the answer to the model, but by truncating the prompt, while retaining the previous text, we leave the answer output by the model blank.
|
| 158 |
+
|
| 159 |
+
```
|
| 160 |
+
Meta instruction: You are now a helpful and harmless AI assistant.
|
| 161 |
+
<SYSTEM>: Solve the following math questions<eosys>\n
|
| 162 |
+
<HUMAN>: 1+1=?<eoh>\n
|
| 163 |
+
<BOT>: 2<eob>\n
|
| 164 |
+
<HUMAN>: 2+2=?<eoh>\n
|
| 165 |
+
<BOT>:
|
| 166 |
+
```
|
| 167 |
+
|
| 168 |
+
We only need to set the `generate` field in BOT's configuration to True, and OpenCompass will automatically leave the last utterance of BOT blank:
|
| 169 |
+
|
| 170 |
+
```python
|
| 171 |
+
# model meta template
|
| 172 |
+
meta_template = dict(
|
| 173 |
+
round=[
|
| 174 |
+
dict(role='HUMAN', begin='<HUMAN>: ', end='<eoh>\n'),
|
| 175 |
+
dict(role='BOT', begin='<BOT>: ', end='<eob>\n', generate=True),
|
| 176 |
+
],
|
| 177 |
+
reserved_roles=[dict(role='SYSTEM', begin='<SYSTEM>: ', end='<eosys>\n'),],
|
| 178 |
+
begin="Meta instruction: You are now a helpful and harmless AI assistant.",
|
| 179 |
+
end="end of conversation",
|
| 180 |
+
),
|
| 181 |
+
```
|
| 182 |
+
|
| 183 |
+
Note that `generate` only affects generative inference. When performing discriminative inference, the prompt received by the model is still complete.
|
| 184 |
+
|
| 185 |
+
### Full Definition
|
| 186 |
+
|
| 187 |
+
```bash
|
| 188 |
+
models = [
|
| 189 |
+
dict(meta_template = dict(
|
| 190 |
+
begin="Meta instruction: You are now a helpful and harmless AI assistant.",
|
| 191 |
+
round=[
|
| 192 |
+
dict(role='HUMAN', begin='HUMAN: ', end='<eoh>\n'), # begin and end can be a list of strings or integers.
|
| 193 |
+
dict(role='THOUGHTS', begin='THOUGHTS: ', end='<eot>\n', prompt='None'), # Here we can set the default prompt, which may be overridden by the specific dataset
|
| 194 |
+
dict(role='BOT', begin='BOT: ', generate=True, end='<eob>\n'),
|
| 195 |
+
],
|
| 196 |
+
        end="end of conversation",
|
| 197 |
+
reserved_roles=[dict(role='SYSTEM', begin='SYSTEM: ', end='\n'),],
|
| 198 |
+
eos_token_id=10000,
|
| 199 |
+
),
|
| 200 |
+
)
|
| 201 |
+
]
|
| 202 |
+
```
|
| 203 |
+
|
| 204 |
+
The `meta_template` is a dictionary that can contain the following fields:
|
| 205 |
+
|
| 206 |
+
- `begin`, `end`: (str, optional) The beginning and ending of the prompt, typically some system-level instructions.
|
| 207 |
+
|
| 208 |
+
- `round`: (list) The template format of each round of dialogue. The content of the prompt for each round of dialogue is controlled by the dialogue template configured in the dataset.
|
| 209 |
+
|
| 210 |
+
- `reserved_roles`: (list, optional) Specify roles that do not appear in `round` but may be used in the dataset configuration, such as the `SYSTEM` role.
|
| 211 |
+
|
| 212 |
+
- `eos_token_id`: (int, optional): Specifies the ID of the model's eos token. If not set, it defaults to the eos token id in the tokenizer. Its main role is to trim the output of the model in generative tasks, so it should generally be set to the first token id of the end corresponding to the item with generate=True.
|
| 213 |
+
|
| 214 |
+
The `round` of the `meta_template` specifies the format of each role's speech in a round of dialogue. It accepts a list of dictionaries, each dictionary's keys are as follows:
|
| 215 |
+
|
| 216 |
+
- `role` (str): The name of the role participating in the dialogue. This string does not affect the actual prompt.
|
| 217 |
+
|
| 218 |
+
- `begin`, `end` (str): Specifies the fixed beginning or end when this role speaks.
|
| 219 |
+
|
| 220 |
+
- `prompt` (str): The role's prompt. It is allowed to leave it blank in the meta template, but in this case, it must be specified in the prompt of the dataset configuration.
|
| 221 |
+
|
| 222 |
+
- `generate` (bool): When specified as True, this role is the one the model plays. In generation tasks, the prompt received by the model will be cut off at the `begin` of this role, and the remaining content will be filled by the model.
|
| 223 |
+
|
| 224 |
+
## Application to API Models
|
| 225 |
+
|
| 226 |
+
The meta template of the API model is similar to the meta template of the general model, but the configuration is simpler. Users can, as per their requirements, directly use one of the two configurations below to evaluate the API model in a multi-turn dialogue manner:
|
| 227 |
+
|
| 228 |
+
```bash
|
| 229 |
+
# If the API model does not support system instructions
|
| 230 |
+
meta_template=dict(
|
| 231 |
+
round=[
|
| 232 |
+
dict(role='HUMAN', api_role='HUMAN'),
|
| 233 |
+
dict(role='BOT', api_role='BOT', generate=True)
|
| 234 |
+
],
|
| 235 |
+
)
|
| 236 |
+
|
| 237 |
+
# If the API model supports system instructions
|
| 238 |
+
meta_template=dict(
|
| 239 |
+
round=[
|
| 240 |
+
dict(role='HUMAN', api_role='HUMAN'),
|
| 241 |
+
dict(role='BOT', api_role='BOT', generate=True)
|
| 242 |
+
],
|
| 243 |
+
reserved_roles=[
|
| 244 |
+
dict(role='SYSTEM', api_role='SYSTEM'),
|
| 245 |
+
],
|
| 246 |
+
)
|
| 247 |
+
```
|
| 248 |
+
|
| 249 |
+
### Principle
|
| 250 |
+
|
| 251 |
+
Even though different API models accept different data structures, there are commonalities overall. Interfaces that accept dialogue history generally allow users to pass in prompts from the following three roles:
|
| 252 |
+
|
| 253 |
+
- User
|
| 254 |
+
- Robot
|
| 255 |
+
- System (optional)
|
| 256 |
+
|
| 257 |
+
In this regard, OpenCompass has preset three `api_role` values for API models: `HUMAN`, `BOT`, `SYSTEM`, and stipulates that in addition to regular strings, the input accepted by API models includes a middle format of dialogue represented by `PromptList`. The API model will repackage the dialogue in a multi-turn dialogue format and send it to the backend. However, to activate this feature, users need to map the roles `role` in the dataset prompt template to the corresponding `api_role` in the above meta template. The following figure illustrates the relationship between the input accepted by the API model and the Prompt Template and Meta Template.
|
| 258 |
+
|
| 259 |
+

|
| 260 |
+
|
| 261 |
+
## Debugging
|
| 262 |
+
|
| 263 |
+
If you need to debug the prompt, it is recommended to use the `tools/prompt_viewer.py` script to preview the actual prompt received by the model after preparing the configuration file. Read [here](../tools.md#prompt-viewer) for more.
|
opencompass/docs/en/tools.md
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Useful Tools
|
| 2 |
+
|
| 3 |
+
## Prompt Viewer
|
| 4 |
+
|
| 5 |
+
This tool allows you to directly view the generated prompt without starting the full training process. If the passed configuration is only the dataset configuration (such as `configs/datasets/nq/nq_gen.py`), it will display the original prompt defined in the dataset configuration. If it is a complete evaluation configuration (including the model and the dataset), it will display the prompt received by the selected model during operation.
|
| 6 |
+
|
| 7 |
+
Running method:
|
| 8 |
+
|
| 9 |
+
```bash
|
| 10 |
+
python tools/prompt_viewer.py CONFIG_PATH [-n] [-a] [-p PATTERN]
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
- `-n`: Do not enter interactive mode, select the first model (if any) and dataset by default.
|
| 14 |
+
- `-a`: View the prompts received by all models and all dataset combinations in the configuration.
|
| 15 |
+
- `-p PATTERN`: Do not enter interactive mode, select all datasets that match the input regular expression.
|
| 16 |
+
|
| 17 |
+
## Case Analyzer (To be updated)
|
| 18 |
+
|
| 19 |
+
Based on existing evaluation results, this tool produces inference error samples and full samples with annotation information.
|
| 20 |
+
|
| 21 |
+
Running method:
|
| 22 |
+
|
| 23 |
+
```bash
|
| 24 |
+
python tools/case_analyzer.py CONFIG_PATH [-w WORK_DIR]
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
- `-w`: Work path, default is `'./outputs/default'`.
|
| 28 |
+
|
| 29 |
+
## Lark Bot
|
| 30 |
+
|
| 31 |
+
Users can configure the Lark bot to implement real-time monitoring of task status. Please refer to [this document](https://open.feishu.cn/document/ukTMukTMukTM/ucTM5YjL3ETO24yNxkjN?lang=zh-CN#7a28964d) for setting up the Lark bot.
|
| 32 |
+
|
| 33 |
+
Configuration method:
|
| 34 |
+
|
| 35 |
+
- Open the `configs/secrets.py` file, and add the following line to the file:
|
| 36 |
+
|
| 37 |
+
```python
|
| 38 |
+
lark_bot_url = 'YOUR_WEBHOOK_URL'
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
- Normally, the Webhook URL format is like https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxxxxxxxxx .
|
| 42 |
+
|
| 43 |
+
- Inherit this file in the complete evaluation configuration
|
| 44 |
+
|
| 45 |
+
- To avoid the bot sending messages frequently and causing disturbance, the running status will not be reported automatically by default. If necessary, you can start status reporting through `-l` or `--lark`:
|
| 46 |
+
|
| 47 |
+
```bash
|
| 48 |
+
python run.py configs/eval_demo.py -l
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
## API Model Tester
|
| 52 |
+
|
| 53 |
+
This tool can quickly test whether the functionality of the API model is normal.
|
| 54 |
+
|
| 55 |
+
Running method:
|
| 56 |
+
|
| 57 |
+
```bash
|
| 58 |
+
python tools/test_api_model.py [CONFIG_PATH] -n
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
## Prediction Merger
|
| 62 |
+
|
| 63 |
+
This tool can merge partitioned predictions.
|
| 64 |
+
|
| 65 |
+
Running method:
|
| 66 |
+
|
| 67 |
+
```bash
|
| 68 |
+
python tools/prediction_merger.py CONFIG_PATH [-w WORK_DIR]
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
- `-w`: Work path, default is `'./outputs/default'`.
|
| 72 |
+
|
| 73 |
+
## List Configs
|
| 74 |
+
|
| 75 |
+
This tool can list or search all available model and dataset configurations. It supports fuzzy search, making it convenient for use in conjunction with `run.py`.
|
| 76 |
+
|
| 77 |
+
Usage:
|
| 78 |
+
|
| 79 |
+
```bash
|
| 80 |
+
python tools/list_configs.py [PATTERN1] [PATTERN2] [...]
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
If executed without any parameters, it will list all model configurations in the `configs/models` and `configs/dataset` directories by default.
|
| 84 |
+
|
| 85 |
+
Users can also pass any number of parameters. The script will list all configurations related to the provided strings, supporting fuzzy search and the use of the * wildcard. For example, the following command will list all configurations related to `mmlu` and `llama`:
|
| 86 |
+
|
| 87 |
+
```bash
|
| 88 |
+
python tools/list_configs.py mmlu llama
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
Its output could be:
|
| 92 |
+
|
| 93 |
+
```text
|
| 94 |
+
+-----------------+-----------------------------------+
|
| 95 |
+
| Model | Config Path |
|
| 96 |
+
|-----------------+-----------------------------------|
|
| 97 |
+
| hf_llama2_13b | configs/models/hf_llama2_13b.py |
|
| 98 |
+
| hf_llama2_70b | configs/models/hf_llama2_70b.py |
|
| 99 |
+
| hf_llama2_7b | configs/models/hf_llama2_7b.py |
|
| 100 |
+
| hf_llama_13b | configs/models/hf_llama_13b.py |
|
| 101 |
+
| hf_llama_30b | configs/models/hf_llama_30b.py |
|
| 102 |
+
| hf_llama_65b | configs/models/hf_llama_65b.py |
|
| 103 |
+
| hf_llama_7b | configs/models/hf_llama_7b.py |
|
| 104 |
+
| llama2_13b_chat | configs/models/llama2_13b_chat.py |
|
| 105 |
+
| llama2_70b_chat | configs/models/llama2_70b_chat.py |
|
| 106 |
+
| llama2_7b_chat | configs/models/llama2_7b_chat.py |
|
| 107 |
+
+-----------------+-----------------------------------+
|
| 108 |
+
+-------------------+---------------------------------------------------+
|
| 109 |
+
| Dataset | Config Path |
|
| 110 |
+
|-------------------+---------------------------------------------------|
|
| 111 |
+
| cmmlu_gen | configs/datasets/cmmlu/cmmlu_gen.py |
|
| 112 |
+
| cmmlu_gen_ffe7c0 | configs/datasets/cmmlu/cmmlu_gen_ffe7c0.py |
|
| 113 |
+
| cmmlu_ppl | configs/datasets/cmmlu/cmmlu_ppl.py |
|
| 114 |
+
| cmmlu_ppl_fd1f2f | configs/datasets/cmmlu/cmmlu_ppl_fd1f2f.py |
|
| 115 |
+
| mmlu_gen | configs/datasets/mmlu/mmlu_gen.py |
|
| 116 |
+
| mmlu_gen_23a9a9 | configs/datasets/mmlu/mmlu_gen_23a9a9.py |
|
| 117 |
+
| mmlu_gen_5d1409 | configs/datasets/mmlu/mmlu_gen_5d1409.py |
|
| 118 |
+
| mmlu_gen_79e572 | configs/datasets/mmlu/mmlu_gen_79e572.py |
|
| 119 |
+
| mmlu_gen_a484b3 | configs/datasets/mmlu/mmlu_gen_a484b3.py |
|
| 120 |
+
| mmlu_ppl | configs/datasets/mmlu/mmlu_ppl.py |
|
| 121 |
+
| mmlu_ppl_ac766d | configs/datasets/mmlu/mmlu_ppl_ac766d.py |
|
| 122 |
+
+-------------------+---------------------------------------------------+
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
## Dataset Suffix Updater
|
| 126 |
+
|
| 127 |
+
This tool can quickly modify the suffixes of configuration files located under the `configs/dataset` directory, aligning them with the naming conventions based on prompt hash.
|
| 128 |
+
|
| 129 |
+
How to run:
|
| 130 |
+
|
| 131 |
+
```bash
|
| 132 |
+
python tools/update_dataset_suffix.py
|
| 133 |
+
```
|
opencompass/docs/zh_cn/.readthedocs.yaml
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: 2
|
| 2 |
+
|
| 3 |
+
# Set the version of Python and other tools you might need
|
| 4 |
+
build:
|
| 5 |
+
os: ubuntu-22.04
|
| 6 |
+
tools:
|
| 7 |
+
python: "3.8"
|
| 8 |
+
|
| 9 |
+
formats:
|
| 10 |
+
- epub
|
| 11 |
+
|
| 12 |
+
sphinx:
|
| 13 |
+
configuration: docs/zh_cn/conf.py
|
| 14 |
+
|
| 15 |
+
python:
|
| 16 |
+
install:
|
| 17 |
+
- requirements: requirements/docs.txt
|
opencompass/docs/zh_cn/Makefile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Minimal makefile for Sphinx documentation
|
| 2 |
+
#
|
| 3 |
+
|
| 4 |
+
# You can set these variables from the command line, and also
|
| 5 |
+
# from the environment for the first two.
|
| 6 |
+
SPHINXOPTS ?=
|
| 7 |
+
SPHINXBUILD ?= sphinx-build
|
| 8 |
+
SOURCEDIR = .
|
| 9 |
+
BUILDDIR = _build
|
| 10 |
+
|
| 11 |
+
# Put it first so that "make" without argument is like "make help".
|
| 12 |
+
help:
|
| 13 |
+
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
| 14 |
+
|
| 15 |
+
.PHONY: help Makefile
|
| 16 |
+
|
| 17 |
+
# Catch-all target: route all unknown targets to Sphinx using the new
|
| 18 |
+
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
| 19 |
+
%: Makefile
|
| 20 |
+
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
opencompass/docs/zh_cn/_static/css/readthedocs.css
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.header-logo {
|
| 2 |
+
background-image: url("../image/logo.svg");
|
| 3 |
+
background-size: 275px 80px;
|
| 4 |
+
height: 80px;
|
| 5 |
+
width: 275px;
|
| 6 |
+
}
|
| 7 |
+
|
| 8 |
+
@media screen and (min-width: 1100px) {
|
| 9 |
+
.header-logo {
|
| 10 |
+
top: -25px;
|
| 11 |
+
}
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
pre {
|
| 15 |
+
white-space: pre;
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
@media screen and (min-width: 2000px) {
|
| 19 |
+
.pytorch-content-left {
|
| 20 |
+
width: 1200px;
|
| 21 |
+
margin-left: 30px;
|
| 22 |
+
}
|
| 23 |
+
article.pytorch-article {
|
| 24 |
+
max-width: 1200px;
|
| 25 |
+
}
|
| 26 |
+
.pytorch-breadcrumbs-wrapper {
|
| 27 |
+
width: 1200px;
|
| 28 |
+
}
|
| 29 |
+
.pytorch-right-menu.scrolling-fixed {
|
| 30 |
+
position: fixed;
|
| 31 |
+
top: 45px;
|
| 32 |
+
left: 1580px;
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
article.pytorch-article section code {
|
| 38 |
+
padding: .2em .4em;
|
| 39 |
+
background-color: #f3f4f7;
|
| 40 |
+
border-radius: 5px;
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
/* Disable the change in tables */
|
| 44 |
+
article.pytorch-article section table code {
|
| 45 |
+
padding: unset;
|
| 46 |
+
background-color: unset;
|
| 47 |
+
border-radius: unset;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
table.autosummary td {
|
| 51 |
+
width: 50%
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
img.align-center {
|
| 55 |
+
display: block;
|
| 56 |
+
margin-left: auto;
|
| 57 |
+
margin-right: auto;
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
article.pytorch-article p.rubric {
|
| 61 |
+
font-weight: bold;
|
| 62 |
+
}
|
opencompass/docs/zh_cn/_static/image/logo.svg
ADDED
|
|
opencompass/docs/zh_cn/_static/image/logo_icon.svg
ADDED
|
|
opencompass/docs/zh_cn/_static/js/custom.js
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
var collapsedSections = [];
|
| 2 |
+
|
| 3 |
+
$(document).ready(function () {
|
| 4 |
+
$('.model-summary').DataTable({
|
| 5 |
+
"stateSave": false,
|
| 6 |
+
"lengthChange": false,
|
| 7 |
+
"pageLength": 20,
|
| 8 |
+
"order": []
|
| 9 |
+
});
|
| 10 |
+
});
|
opencompass/docs/zh_cn/_templates/404.html
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% extends "layout.html" %}
|
| 2 |
+
|
| 3 |
+
{% block body %}
|
| 4 |
+
|
| 5 |
+
<h1>Page Not Found</h1>
|
| 6 |
+
<p>
|
| 7 |
+
The page you are looking for cannot be found.
|
| 8 |
+
</p>
|
| 9 |
+
<p>
|
| 10 |
+
If you just switched documentation versions, it is likely that the page you were on is moved. You can look for it in
|
| 11 |
+
the content table left, or go to <a href="{{ pathto(root_doc) }}">the homepage</a>.
|
| 12 |
+
</p>
|
| 13 |
+
<!-- <p>
|
| 14 |
+
If you cannot find documentation you want, please <a
|
| 15 |
+
href="">open an issue</a> to tell us!
|
| 16 |
+
</p> -->
|
| 17 |
+
|
| 18 |
+
{% endblock %}
|
opencompass/docs/zh_cn/_templates/autosummary/class.rst
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.. role:: hidden
|
| 2 |
+
:class: hidden-section
|
| 3 |
+
.. currentmodule:: {{ module }}
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
{{ name | underline}}
|
| 7 |
+
|
| 8 |
+
.. autoclass:: {{ name }}
|
| 9 |
+
:members:
|
| 10 |
+
|
| 11 |
+
..
|
| 12 |
+
autogenerated from _templates/autosummary/class.rst
|
| 13 |
+
note it does not have :inherited-members:
|
opencompass/docs/zh_cn/_templates/callable.rst
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.. role:: hidden
|
| 2 |
+
:class: hidden-section
|
| 3 |
+
.. currentmodule:: {{ module }}
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
{{ name | underline}}
|
| 7 |
+
|
| 8 |
+
.. autoclass:: {{ name }}
|
| 9 |
+
:members:
|
| 10 |
+
:special-members: __call__
|
| 11 |
+
|
| 12 |
+
..
|
| 13 |
+
autogenerated from _templates/callable.rst
|
| 14 |
+
note it does not have :inherited-members:
|
opencompass/docs/zh_cn/advanced_guides/accelerator_intro.md
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 使用 vLLM 或 LMDeploy 来一键式加速评测推理
|
| 2 |
+
|
| 3 |
+
## 背景
|
| 4 |
+
|
| 5 |
+
在 OpenCompass 评测过程中,默认使用 Huggingface 的 transformers 库进行推理,这是一个非常通用的方案,但在某些情况下,我们可能需要更高效的推理方法来加速这一过程,比如借助 VLLM 或 LMDeploy。
|
| 6 |
+
|
| 7 |
+
- [LMDeploy](https://github.com/InternLM/lmdeploy) 是一个用于压缩、部署和服务大型语言模型(LLM)的工具包,由 [MMRazor](https://github.com/open-mmlab/mmrazor) 和 [MMDeploy](https://github.com/open-mmlab/mmdeploy) 团队开发。
|
| 8 |
+
- [vLLM](https://github.com/vllm-project/vllm) 是一个快速且易于使用的 LLM 推理和服务库,具有先进的服务吞吐量、高效的 PagedAttention 内存管理、连续批处理请求、CUDA/HIP 图的快速模型执行、量化技术(如 GPTQ、AWQ、SqueezeLLM、FP8 KV Cache)以及优化的 CUDA 内核。
|
| 9 |
+
|
| 10 |
+
## 加速前准备
|
| 11 |
+
|
| 12 |
+
首先,请检查您要评测的模型是否支持使用 vLLM 或 LMDeploy 进行推理加速。其次,请确保您已经安装了 vLLM 或 LMDeploy,具体安装方法请参考它们的官方文档,下面是参考的安装方法:
|
| 13 |
+
|
| 14 |
+
### LMDeploy 安装方法
|
| 15 |
+
|
| 16 |
+
使用 pip (Python 3.8+) 或从 [源码](https://github.com/InternLM/lmdeploy/blob/main/docs/en/build.md) 安装 LMDeploy:
|
| 17 |
+
|
| 18 |
+
```bash
|
| 19 |
+
pip install lmdeploy
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
### VLLM 安装方法
|
| 23 |
+
|
| 24 |
+
使用 pip 或从 [源码](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source) 安装 vLLM:
|
| 25 |
+
|
| 26 |
+
```bash
|
| 27 |
+
pip install vllm
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
## 评测时使用 VLLM 或 LMDeploy
|
| 31 |
+
|
| 32 |
+
### 方法1:使用命令行参数来变更推理后端
|
| 33 |
+
|
| 34 |
+
OpenCompass 提供了一键式的评测加速,可以在评测过程中自动将 Huggingface 的 transformers 模型转化为 VLLM 或 LMDeploy 的模型,以便在评测过程中使用。以下是使用默认 Huggingface 版本的 llama3-8b-instruct 模型评测 GSM8k 数据集的样例代码:
|
| 35 |
+
|
| 36 |
+
```python
|
| 37 |
+
# eval_gsm8k.py
|
| 38 |
+
from mmengine.config import read_base
|
| 39 |
+
|
| 40 |
+
with read_base():
|
| 41 |
+
# 选择一个数据集列表
|
| 42 |
+
from .datasets.gsm8k.gsm8k_0shot_gen_a58960 import gsm8k_datasets as datasets
|
| 43 |
+
# 选择一个感兴趣的模型
|
| 44 |
+
from ..models.hf_llama.hf_llama3_8b_instruct import models
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
其中 `hf_llama3_8b_instruct` 为原版 Huggingface 模型配置,内容如下:
|
| 48 |
+
|
| 49 |
+
```python
|
| 50 |
+
from opencompass.models import HuggingFacewithChatTemplate
|
| 51 |
+
|
| 52 |
+
models = [
|
| 53 |
+
dict(
|
| 54 |
+
type=HuggingFacewithChatTemplate,
|
| 55 |
+
abbr='llama-3-8b-instruct-hf',
|
| 56 |
+
path='meta-llama/Meta-Llama-3-8B-Instruct',
|
| 57 |
+
max_out_len=1024,
|
| 58 |
+
batch_size=8,
|
| 59 |
+
run_cfg=dict(num_gpus=1),
|
| 60 |
+
stop_words=['<|end_of_text|>', '<|eot_id|>'],
|
| 61 |
+
)
|
| 62 |
+
]
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
默认 Huggingface 版本的 Llama3-8b-instruct 模型评测 GSM8k 数据集的方式如下:
|
| 66 |
+
|
| 67 |
+
```bash
|
| 68 |
+
python run.py config/eval_gsm8k.py
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
如果需要使用 vLLM 或 LMDeploy 进行加速评测,可以使用下面的脚本:
|
| 72 |
+
|
| 73 |
+
```bash
|
| 74 |
+
python run.py config/eval_gsm8k.py -a vllm
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
或
|
| 78 |
+
|
| 79 |
+
```bash
|
| 80 |
+
python run.py config/eval_gsm8k.py -a lmdeploy
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
### 方法2:通过部署推理加速服务API来加速评测
|
| 84 |
+
|
| 85 |
+
OpenCompass 还支持通过部署vLLM或LMDeploy的推理加速服务 API 来加速评测,参考步骤如下:
|
| 86 |
+
|
| 87 |
+
1. 安装openai包:
|
| 88 |
+
|
| 89 |
+
```bash
|
| 90 |
+
pip install openai
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
2. 部署 vLLM 或 LMDeploy 的推理加速服务 API,具体部署方法请参考它们的官方文档,下面以LMDeploy为例:
|
| 94 |
+
|
| 95 |
+
```bash
|
| 96 |
+
lmdeploy serve api_server meta-llama/Meta-Llama-3-8B-Instruct --model-name Meta-Llama-3-8B-Instruct --server-port 23333
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
api_server 启动时的参数可以通过命令行`lmdeploy serve api_server -h`查看。 比如,--tp 设置张量并行,--session-len 设置推理的最大上下文窗口长度,--cache-max-entry-count 调整 k/v cache 的内存使用比例等等。
|
| 100 |
+
|
| 101 |
+
3. 服务部署成功后,修改评测脚本,将模型配置中的路径改为部署的服务地址,如下:
|
| 102 |
+
|
| 103 |
+
```python
|
| 104 |
+
from opencompass.models import OpenAISDK
|
| 105 |
+
|
| 106 |
+
api_meta_template = dict(
|
| 107 |
+
round=[
|
| 108 |
+
dict(role='HUMAN', api_role='HUMAN'),
|
| 109 |
+
dict(role='BOT', api_role='BOT', generate=True),
|
| 110 |
+
],
|
| 111 |
+
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
|
| 112 |
+
)
|
| 113 |
+
|
| 114 |
+
models = [
|
| 115 |
+
dict(
|
| 116 |
+
abbr='Meta-Llama-3-8B-Instruct-LMDeploy-API',
|
| 117 |
+
type=OpenAISDK,
|
| 118 |
+
key='EMPTY', # API key
|
| 119 |
+
openai_api_base='http://0.0.0.0:23333/v1', # 服务地址
|
| 120 |
+
        path='Meta-Llama-3-8B-Instruct', # 请求服务时的 model name
|
| 121 |
+
tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', # 请求服务时的 tokenizer name 或 path, 为None时使用默认tokenizer gpt-4
|
| 122 |
+
rpm_verbose=True, # 是否打印请求速率
|
| 123 |
+
meta_template=api_meta_template, # 服务请求模板
|
| 124 |
+
query_per_second=1, # 服务请求速率
|
| 125 |
+
max_out_len=1024, # 最大输出长度
|
| 126 |
+
max_seq_len=4096, # 最大输入长度
|
| 127 |
+
temperature=0.01, # 生成温度
|
| 128 |
+
batch_size=8, # 批处理大小
|
| 129 |
+
retry=3, # 重试次数
|
| 130 |
+
)
|
| 131 |
+
]
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
## 加速效果及性能对比
|
| 135 |
+
|
| 136 |
+
下面是使用 VLLM 或 LMDeploy 在单卡 A800 上 Llama-3-8B-Instruct 模型对 GSM8k 数据集进行加速评测的效果及性能对比表:
|
| 137 |
+
|
| 138 |
+
| 推理后端 | 精度(Accuracy) | 推理时间(分钟:秒) | 加速比(相对于 Huggingface) |
|
| 139 |
+
| ----------- | ---------------- | -------------------- | ---------------------------- |
|
| 140 |
+
| Huggingface | 74.22 | 24:26 | 1.0 |
|
| 141 |
+
| LMDeploy | 73.69 | 11:15 | 2.2 |
|
| 142 |
+
| VLLM | 72.63 | 07:52 | 3.1 |
|
opencompass/docs/zh_cn/advanced_guides/circular_eval.md
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 循环评测
|
| 2 |
+
|
| 3 |
+
## 背景
|
| 4 |
+
|
| 5 |
+
对于选择题而言,当 LLM 给出正确的选项,并不一定代表着它能真正地理解题意并经过推理得出答案,它也有可能是蒙对的。为了将这两种情形区分开,同时也为了降低 LLM 对选项的偏见,我们可以尝试使用循环评测 (CircularEval)。我们会将一道选择题按照打乱选项的方式进行增广,若 LLM 可以在增广后的每道题上均得到正确的答案,那么我们认为在循环评测的意义下,这道题被做对了。
|
| 6 |
+
|
| 7 |
+
## 新增自己的循环评测数据集
|
| 8 |
+
|
| 9 |
+
一般来说,为了将一个数据集使用循环评测的方式进行评测,它的加载方式和评测方式是需要被重写的,OpenCompass 主库和配置文件均需要进行修改。后续我们以 C-Eval 为例进行讲解。
|
| 10 |
+
|
| 11 |
+
OpenCompass 主库:
|
| 12 |
+
|
| 13 |
+
```python
|
| 14 |
+
from opencompass.datasets.ceval import CEvalDataset
|
| 15 |
+
from opencompass.datasets.circular import CircularDatasetMeta
|
| 16 |
+
|
| 17 |
+
class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta):
|
| 18 |
+
# 被重载的数据集类
|
| 19 |
+
dataset_class = CEvalDataset
|
| 20 |
+
|
| 21 |
+
# 若原 load 方法得到一 DatasetDict,其哪些 split 需要被循环评测。CEvalDataset load 得到 [dev, val, test],我们只需要对 val 和 test 进行循环评测,dev 不需要
|
| 22 |
+
default_circular_splits = ['val', 'test']
|
| 23 |
+
|
| 24 |
+
# 需要被打乱的 key 列表
|
| 25 |
+
default_option_keys = ['A', 'B', 'C', 'D']
|
| 26 |
+
|
| 27 |
+
# 若 answer_key 的内容属于是 ['A', 'B', 'C', 'D'] 之一,并表示正确答案。该字段表示打乱选项后,需要如何更新正确答案。与 default_answer_key_switch_method 二选一
|
| 28 |
+
default_answer_key = 'answer'
|
| 29 |
+
|
| 30 |
+
# 如果 answer_key 的内容不属于 ['A', 'B', 'C', 'D'] 之一,那么可以使用函数的方式来指定打乱选项后的正确答案。与 default_answer_key 二选一
|
| 31 |
+
# def default_answer_key_switch_method(item, circular_pattern):
|
| 32 |
+
# # item 是原本的数据项
|
| 33 |
+
# # circular_pattern 是一个 tuple,表示打乱选项后的顺序,例如 ('D', 'A', 'B', 'C') 表示原来的 A 选项变成了 D,原来的 B 选项变成了 A,以此类推
|
| 34 |
+
# item['answer'] = circular_pattern['ABCD'.index(item['answer'])]
|
| 35 |
+
# return item
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
`CircularCEvalDataset` 会接受 `circular_pattern` 参数,它有两个取值:
|
| 39 |
+
|
| 40 |
+
- `circular`: 表示单项循环。默认为该值。ABCD 会被扩充为 ABCD, BCDA, CDAB, DABC, 共 4 种
|
| 41 |
+
- `all_possible`: 表示全排列。ABCD 会被扩充为 ABCD, ABDC, ACBD, ACDB, ADBC, ADCB, BACD, ..., 共 24 种
|
| 42 |
+
|
| 43 |
+
另外我们提供了一个 `CircularEvaluator` 用于替换 `AccEvaluator`,该 Evaluator 同样接受 `circular_pattern`,该参数应与上述保持一致。它会产出以下指标:
|
| 44 |
+
|
| 45 |
+
- `acc_{origin|circular|all_possible}`: 将打乱后选项顺序后的题目视作多道单独的题目,计算准确率
|
| 46 |
+
- `perf_{origin|circular|all_possible}`: 按照 circular 的逻辑,若选项打乱后的题目都回答正确,才会视为这道题正确,计算准确率
|
| 47 |
+
- `more_{num}_{origin|circular|all_possible}`: 按照 circular 的逻辑,若选项打乱后的题目回答正确的数量大于等于 num,就会视为这道题正确,计算准确率
|
| 48 |
+
|
| 49 |
+
OpenCompass 配置文件:
|
| 50 |
+
|
| 51 |
+
```python
|
| 52 |
+
from mmengine.config import read_base
|
| 53 |
+
from opencompass.datasets.circular import CircularCEvalDataset
|
| 54 |
+
|
| 55 |
+
with read_base():
|
| 56 |
+
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
|
| 57 |
+
|
| 58 |
+
for d in ceval_datasets:
|
| 59 |
+
# 重载 load 方法
|
| 60 |
+
d['type'] = CircularCEvalDataset
|
| 61 |
+
# 为了与非循环评测版本做区分而进行改名
|
| 62 |
+
d['abbr'] = d['abbr'] + '-circular-4'
|
| 63 |
+
# 重载评测方法
|
| 64 |
+
d['eval_cfg']['evaluator'] = {'type': CircularEvaluator}
|
| 65 |
+
|
| 66 |
+
# 上述操作后的 dataset 形如下:
|
| 67 |
+
# dict(
|
| 68 |
+
# type=CircularCEvalDataset,
|
| 69 |
+
# path='./data/ceval/formal_ceval', # 未改变
|
| 70 |
+
# name='computer_network', # 未改变
|
| 71 |
+
# abbr='ceval-computer_network-circular-4',
|
| 72 |
+
# reader_cfg=dict(...), # 未改变
|
| 73 |
+
# infer_cfg=dict(...), # 未改变
|
| 74 |
+
# eval_cfg=dict(evaluator=dict(type=CircularEvaluator), ...),
|
| 75 |
+
# )
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
另外评测时为了针对循环评测有更良好的结果呈现,建议考虑使用以下 summarizer
|
| 79 |
+
|
| 80 |
+
```python
|
| 81 |
+
from mmengine.config import read_base
|
| 82 |
+
from opencompass.summarizers import CircularSummarizer
|
| 83 |
+
|
| 84 |
+
with read_base():
|
| 85 |
+
from ...summarizers.groups.ceval import ceval_summary_groups
|
| 86 |
+
|
| 87 |
+
new_summary_groups = []
|
| 88 |
+
for item in ceval_summary_groups:
|
| 89 |
+
new_summary_groups.append(
|
| 90 |
+
{
|
| 91 |
+
'name': item['name'] + '-circular-4',
|
| 92 |
+
'subsets': [i + '-circular-4' for i in item['subsets']],
|
| 93 |
+
}
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
+
summarizer = dict(
|
| 97 |
+
type=CircularSummarizer,
|
| 98 |
+
# 选择具体看哪些指标
|
| 99 |
+
metric_types=['acc_origin', 'perf_circular'],
|
| 100 |
+
dataset_abbrs = [
|
| 101 |
+
'ceval-circular-4',
|
| 102 |
+
'ceval-humanities-circular-4',
|
| 103 |
+
'ceval-stem-circular-4',
|
| 104 |
+
'ceval-social-science-circular-4',
|
| 105 |
+
'ceval-other-circular-4',
|
| 106 |
+
],
|
| 107 |
+
summary_groups=new_summary_groups,
|
| 108 |
+
)
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
更多复杂的评测案例可以参考这个样例代码: https://github.com/open-compass/opencompass/blob/main/configs/eval_circular.py
|
opencompass/docs/zh_cn/advanced_guides/code_eval.md
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 代码评测教程
|
| 2 |
+
|
| 3 |
+
这里以 `humaneval` 和 `mbpp` 为例,主要介绍如何评测模型的代码能力。
|
| 4 |
+
|
| 5 |
+
## pass@1
|
| 6 |
+
|
| 7 |
+
如果只需要生成单条回复来评测pass@1的性能,可以直接使用[configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) 和 [configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py) 并参考通用的[快速上手教程](../get_started/quick_start.md)即可。
|
| 8 |
+
|
| 9 |
+
如果要进行多语言评测,可以参考[多语言代码评测教程](./code_eval_service.md)。
|
| 10 |
+
|
| 11 |
+
## pass@k
|
| 12 |
+
|
| 13 |
+
如果对于单个example需要生成多条回复来评测pass@k的性能,需要参考以下两种情况。这里以10回复为例子:
|
| 14 |
+
|
| 15 |
+
### 通常情况
|
| 16 |
+
|
| 17 |
+
对于绝大多数模型来说,模型支持HF的generation中带有`num_return_sequences` 参数,我们可以直接使用来获取多回复。可以参考以下配置文件。
|
| 18 |
+
|
| 19 |
+
```python
|
| 20 |
+
from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator
|
| 21 |
+
|
| 22 |
+
with read_base():
|
| 23 |
+
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
| 24 |
+
from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
|
| 25 |
+
|
| 26 |
+
mbpp_datasets[0]['type'] = MBPPDatasetV2
|
| 27 |
+
mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
|
| 28 |
+
mbpp_datasets[0]['reader_cfg']['output_column'] = 'test_column'
|
| 29 |
+
|
| 30 |
+
datasets = []
|
| 31 |
+
datasets += humaneval_datasets
|
| 32 |
+
datasets += mbpp_datasets
|
| 33 |
+
|
| 34 |
+
models = [
|
| 35 |
+
dict(
|
| 36 |
+
type=HuggingFaceCausalLM,
|
| 37 |
+
...,
|
| 38 |
+
generation_kwargs=dict(
|
| 39 |
+
num_return_sequences=10,
|
| 40 |
+
do_sample=True,
|
| 41 |
+
top_p=0.95,
|
| 42 |
+
temperature=0.8,
|
| 43 |
+
),
|
| 44 |
+
...,
|
| 45 |
+
)
|
| 46 |
+
]
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
对于 `mbpp`,在数据集和评测上需要有新的变更,所以同步修改`type`, `eval_cfg.evaluator.type`, `reader_cfg.output_column` 字段来适应新的需求。
|
| 50 |
+
|
| 51 |
+
另外我们需要模型的回复有随机性,同步需要设置`generation_kwargs`参数。这里注意要设置`num_return_sequences`得到回复数。
|
| 52 |
+
|
| 53 |
+
注意:`num_return_sequences` 必须大于等于k,本身pass@k是计算的概率估计。
|
| 54 |
+
|
| 55 |
+
具体可以参考以下配置文件
|
| 56 |
+
[configs/eval_code_passk.py](https://github.com/open-compass/opencompass/blob/main/configs/eval_code_passk.py)
|
| 57 |
+
|
| 58 |
+
### 模型不支持多回复
|
| 59 |
+
|
| 60 |
+
适用于一些没有设计好的API以及功能缺失的HF模型。这个时候我们需要重复构造数据集来达到多回复的效果。这里可以参考以下配置文件。
|
| 61 |
+
|
| 62 |
+
```python
|
| 63 |
+
from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator
|
| 64 |
+
|
| 65 |
+
with read_base():
|
| 66 |
+
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
| 67 |
+
from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
|
| 68 |
+
|
| 69 |
+
humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10'
|
| 70 |
+
humaneval_datasets[0]['num_repeats'] = 10
|
| 71 |
+
mbpp_datasets[0]['abbr'] = 'mbpp_pass10'
|
| 72 |
+
mbpp_datasets[0]['num_repeats'] = 10
|
| 73 |
+
mbpp_datasets[0]['type'] = MBPPDatasetV2
|
| 74 |
+
mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
|
| 75 |
+
mbpp_datasets[0]['reader_cfg']['output_column'] = 'test_column'
|
| 76 |
+
|
| 77 |
+
datasets = []
|
| 78 |
+
datasets += humaneval_datasets
|
| 79 |
+
datasets += mbpp_datasets
|
| 80 |
+
|
| 81 |
+
models = [
|
| 82 |
+
dict(
|
| 83 |
+
type=HuggingFaceCausalLM,
|
| 84 |
+
...,
|
| 85 |
+
generation_kwargs=dict(
|
| 86 |
+
do_sample=True,
|
| 87 |
+
top_p=0.95,
|
| 88 |
+
temperature=0.8,
|
| 89 |
+
),
|
| 90 |
+
...,
|
| 91 |
+
)
|
| 92 |
+
]
|
| 93 |
+
```
|
| 94 |
+
|
| 95 |
+
由于数据集的prompt并没有修改,我们需要替换对应的字段来达到数据集重复的目的。
|
| 96 |
+
需要修改以下字段:
|
| 97 |
+
|
| 98 |
+
- `num_repeats`: 数据集重复的次数
|
| 99 |
+
- `abbr`: 数据集的缩写最好随着重复次数一并修改,因为数据集数量会发生变化,防止与`.cache/dataset_size.json` 中的数值出现差异导致一些潜在的问题。
|
| 100 |
+
|
| 101 |
+
对于 `mbpp`,同样修改`type`, `eval_cfg.evaluator.type`, `reader_cfg.output_column` 字段。
|
| 102 |
+
|
| 103 |
+
另外我们需要模型的回复有随机性,同步需要设置`generation_kwargs`参数。
|
| 104 |
+
|
| 105 |
+
具体可以参考以下配置文件
|
| 106 |
+
[configs/eval_code_passk_repeat_dataset.py](https://github.com/open-compass/opencompass/blob/main/configs/eval_code_passk_repeat_dataset.py)
|
opencompass/docs/zh_cn/advanced_guides/code_eval_service.md
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 代码评测Docker教程
|
| 2 |
+
|
| 3 |
+
为了完成LLM代码能力评测,我们需要搭建一套独立的评测环境,避免在开发环境执行错误代码从而造成不可避免的损失。目前 OpenCompass 使用的代码评测服务可参考[code-evaluator](https://github.com/open-compass/code-evaluator)项目。接下来将围绕代码评测服务介绍不同需要下的评测教程。
|
| 4 |
+
|
| 5 |
+
1. humaneval-x
|
| 6 |
+
|
| 7 |
+
多编程语言的数据集 [humaneval-x](https://huggingface.co/datasets/THUDM/humaneval-x)
|
| 8 |
+
数据集[下载地址](https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx),请下载需要评测的语言(××.jsonl.gz)文件,并放入`./data/humanevalx`文件夹。
|
| 9 |
+
|
| 10 |
+
目前支持的语言有`python`, `cpp`, `go`, `java`, `js`。
|
| 11 |
+
|
| 12 |
+
2. DS1000
|
| 13 |
+
|
| 14 |
+
Python 多算法库数据集 [ds1000](https://github.com/xlang-ai/DS-1000)
|
| 15 |
+
数据集[下载地址](https://github.com/xlang-ai/DS-1000/blob/main/ds1000_data.zip)
|
| 16 |
+
|
| 17 |
+
目前支持的算法库有`Pandas`, `Numpy`, `Tensorflow`, `Scipy`, `Sklearn`, `Pytorch`, `Matplotlib`。
|
| 18 |
+
|
| 19 |
+
## 启动代码评测服务
|
| 20 |
+
|
| 21 |
+
1. 确保您已经安装了 docker,可参考[安装docker文档](https://docs.docker.com/engine/install/)
|
| 22 |
+
2. 拉取代码评测服务项目,并构建 docker 镜像
|
| 23 |
+
|
| 24 |
+
选择你需要的数据集对应的dockerfile,在下面命令中做替换 `humanevalx` 或者 `ds1000`。
|
| 25 |
+
|
| 26 |
+
```shell
|
| 27 |
+
git clone https://github.com/open-compass/code-evaluator.git
|
| 28 |
+
docker build -t code-eval-{your-dataset}:latest -f docker/{your-dataset}/Dockerfile .
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
3. 使用以下命令创建容器
|
| 32 |
+
|
| 33 |
+
```shell
|
| 34 |
+
# 输出日志格式
|
| 35 |
+
docker run -it -p 5000:5000 code-eval-{your-dataset}:latest python server.py
|
| 36 |
+
|
| 37 |
+
# 在后台运行程序
|
| 38 |
+
# docker run -itd -p 5000:5000 code-eval-{your-dataset}:latest python server.py
|
| 39 |
+
|
| 40 |
+
# 使用不同的端口
|
| 41 |
+
# docker run -itd -p 5001:5001 code-eval-{your-dataset}:latest python server.py --port 5001
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
**注:**
|
| 45 |
+
|
| 46 |
+
- 如在评测Go的过程中遇到timeout,请在创建容器时候使用以下命令
|
| 47 |
+
|
| 48 |
+
```shell
|
| 49 |
+
docker run -it -p 5000:5000 -e GO111MODULE=on -e GOPROXY=https://goproxy.io code-eval-{your-dataset}:latest python server.py
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
4. 为了确保您能够访问服务,通过以下命令检测推理环境和评测服务访问情况。 (如果推理和代码评测在同一主机中运行服务,就跳过这个操作)
|
| 53 |
+
|
| 54 |
+
```shell
|
| 55 |
+
ping your_service_ip_address
|
| 56 |
+
telnet your_service_ip_address your_service_port
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
## 本地代码评测
|
| 60 |
+
|
| 61 |
+
模型推理和代码评测服务在同一主机,或者同一局域网中,可以直接进行代码推理及评测。**注意:DS1000暂不支持,请走异地评测**
|
| 62 |
+
|
| 63 |
+
### 配置文件
|
| 64 |
+
|
| 65 |
+
我们已经提供了 humaneval-x 在 codegeex2 上评估的[配置文件](https://github.com/open-compass/opencompass/blob/main/configs/eval_codegeex2.py)作为参考。
|
| 66 |
+
其中数据集以及相关后处理的配置文件为这个[链接](https://github.com/open-compass/opencompass/tree/main/configs/datasets/humanevalx), 需要注意 humanevalx_eval_cfg_dict 中的evaluator 字段。
|
| 67 |
+
|
| 68 |
+
```python
|
| 69 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 70 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
| 71 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
| 72 |
+
from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator
|
| 73 |
+
|
| 74 |
+
humanevalx_reader_cfg = dict(
|
| 75 |
+
input_columns=['prompt'], output_column='task_id', train_split='test')
|
| 76 |
+
|
| 77 |
+
humanevalx_infer_cfg = dict(
|
| 78 |
+
prompt_template=dict(
|
| 79 |
+
type=PromptTemplate,
|
| 80 |
+
template='{prompt}'),
|
| 81 |
+
retriever=dict(type=ZeroRetriever),
|
| 82 |
+
inferencer=dict(type=GenInferencer, max_out_len=1024))
|
| 83 |
+
|
| 84 |
+
humanevalx_eval_cfg_dict = {
|
| 85 |
+
lang : dict(
|
| 86 |
+
evaluator=dict(
|
| 87 |
+
type=HumanevalXEvaluator,
|
| 88 |
+
language=lang,
|
| 89 |
+
ip_address="localhost", # replace to your code_eval_server ip_address, port
|
| 90 |
+
port=5000), # refer to https://github.com/open-compass/code-evaluator to launch a server
|
| 91 |
+
pred_role='BOT')
|
| 92 |
+
for lang in ['python', 'cpp', 'go', 'java', 'js'] # do not support rust now
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
humanevalx_datasets = [
|
| 96 |
+
dict(
|
| 97 |
+
type=HumanevalXDataset,
|
| 98 |
+
abbr=f'humanevalx-{lang}',
|
| 99 |
+
language=lang,
|
| 100 |
+
path='./data/humanevalx',
|
| 101 |
+
reader_cfg=humanevalx_reader_cfg,
|
| 102 |
+
infer_cfg=humanevalx_infer_cfg,
|
| 103 |
+
eval_cfg=humanevalx_eval_cfg_dict[lang])
|
| 104 |
+
for lang in ['python', 'cpp', 'go', 'java', 'js']
|
| 105 |
+
]
|
| 106 |
+
```
|
| 107 |
+
|
| 108 |
+
### 任务启动
|
| 109 |
+
|
| 110 |
+
参考[快速上手教程](../get_started/quick_start.md)
|
| 111 |
+
|
| 112 |
+
## 异地代码评测
|
| 113 |
+
|
| 114 |
+
模型推理和代码评测服务分别在不可访问的不同机器中,需要先进行模型推理,收集代码推理结果。配置文件和推理流程都可以复用上面的教程。
|
| 115 |
+
|
| 116 |
+
### 收集推理结果(仅针对Humanevalx)
|
| 117 |
+
|
| 118 |
+
OpenCompass 在 `tools` 中提供了 `collect_code_preds.py` 脚本对推理结果进行后处理并收集,我们只需要提供启动任务时的配置文件,以及指定复用对应任务的工作目录,其配置与 `run.py` 中的 `-r` 一致,细节可参考[文档](https://opencompass.readthedocs.io/zh-cn/latest/get_started/quick_start.html#id4)。
|
| 119 |
+
|
| 120 |
+
```shell
|
| 121 |
+
python tools/collect_code_preds.py [config] [-r latest]
|
| 122 |
+
```
|
| 123 |
+
|
| 124 |
+
收集到的结果将会按照以下的目录结构保存到 `-r` 对应的工作目录中:
|
| 125 |
+
|
| 126 |
+
```
|
| 127 |
+
workdir/humanevalx
|
| 128 |
+
├── codegeex2-6b
|
| 129 |
+
│ ├── humanevalx_cpp.json
|
| 130 |
+
│ ├── humanevalx_go.json
|
| 131 |
+
│ ├── humanevalx_java.json
|
| 132 |
+
│ ├── humanevalx_js.json
|
| 133 |
+
│ └── humanevalx_python.json
|
| 134 |
+
├── CodeLlama-13b
|
| 135 |
+
│ ├── ...
|
| 136 |
+
├── CodeLlama-13b-Instruct
|
| 137 |
+
│ ├── ...
|
| 138 |
+
├── CodeLlama-13b-Python
|
| 139 |
+
│ ├── ...
|
| 140 |
+
├── ...
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
对于 DS1000 只需要拿到 `opencompass` 对应生成的 prediction 文件即可。
|
| 144 |
+
|
| 145 |
+
### 代码评测
|
| 146 |
+
|
| 147 |
+
#### 以下仅支持Humanevalx
|
| 148 |
+
|
| 149 |
+
确保代码评测服务启动的情况下,使用 `curl` 提交请求:
|
| 150 |
+
|
| 151 |
+
```shell
|
| 152 |
+
curl -X POST -F 'file=@{result_absolute_path}' -F 'dataset={dataset/language}' {your_service_ip_address}:{your_service_port}/evaluate
|
| 153 |
+
```
|
| 154 |
+
|
| 155 |
+
例如:
|
| 156 |
+
|
| 157 |
+
```shell
|
| 158 |
+
curl -X POST -F 'file=@./examples/humanevalx/python.json' -F 'dataset=humanevalx/python' localhost:5000/evaluate
|
| 159 |
+
```
|
| 160 |
+
|
| 161 |
+
得到结果:
|
| 162 |
+
|
| 163 |
+
```
|
| 164 |
+
"{\"pass@1\": 37.19512195121951%}"
|
| 165 |
+
```
|
| 166 |
+
|
| 167 |
+
另外我们额外提供了 `with-prompt` 选项(默认为True),由于有些模型生成结果包含完整的代码(如WizardCoder),不需要 prompt + prediction 的形式进行拼接,可以参考以下命令进行评测。
|
| 168 |
+
|
| 169 |
+
```shell
|
| 170 |
+
curl -X POST -F 'file=@./examples/humanevalx/python.json' -F 'dataset=humanevalx/python' -H 'with-prompt: False' localhost:5000/evaluate
|
| 171 |
+
```
|
| 172 |
+
|
| 173 |
+
#### 以下仅支持DS1000
|
| 174 |
+
|
| 175 |
+
确保代码评测服务启动的情况下,使用 `curl` 提交请求:
|
| 176 |
+
|
| 177 |
+
```shell
|
| 178 |
+
curl -X POST -F 'file=@./internlm-chat-7b-hf-v11/ds1000_Numpy.json' localhost:5000/evaluate
|
| 179 |
+
```
|
| 180 |
+
|
| 181 |
+
DS1000支持额外 debug 参数,注意开启之后会有大量log
|
| 182 |
+
|
| 183 |
+
- `full`: 额外打印每个错误样本的原始prediction,后处理后的prediction,运行程序以及最终报错。
|
| 184 |
+
- `half`: 额外打印每个错误样本的运行程序以及最终报错。
|
| 185 |
+
- `error`: 额外打印每个错误样本的最终报错。
|
| 186 |
+
|
| 187 |
+
```shell
|
| 188 |
+
curl -X POST -F 'file=@./internlm-chat-7b-hf-v11/ds1000_Numpy.json' -F 'debug=error' localhost:5000/evaluate
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
另外还可以通过同样的方式修改`num_workers`来控制并行数。
|
| 192 |
+
|
| 193 |
+
## 进阶教程
|
| 194 |
+
|
| 195 |
+
除了评测已支持的 `humanevalx` 数据集以外,用户还可能有以下需求:
|
| 196 |
+
|
| 197 |
+
### 支持新数据集
|
| 198 |
+
|
| 199 |
+
可以参考[支持新数据集教程](./new_dataset.md)
|
| 200 |
+
|
| 201 |
+
### 修改后处理
|
| 202 |
+
|
| 203 |
+
1. 本地评测中,可以按照支持新数据集教程中的后处理部分来修改后处理方法;
|
| 204 |
+
2. 异地评测中,可以修改 `tools/collect_code_preds.py` 中的后处理部分;
|
| 205 |
+
3. 代码评测服务中,存在部分后处理也可以进行修改,详情参考下一部分教程;
|
| 206 |
+
|
| 207 |
+
### 代码评测服务 Debug
|
| 208 |
+
|
| 209 |
+
在支持新数据集或者修改后处理的过程中,可能会遇到需要修改原本的代码评测服务的情况,按照需求修改以下部分
|
| 210 |
+
|
| 211 |
+
1. 删除 `Dockerfile` 中安装 `code-evaluator` 的部分,在启动容器时将 `code-evaluator` 挂载
|
| 212 |
+
|
| 213 |
+
```shell
|
| 214 |
+
docker run -it -p 5000:5000 -v /local/path/of/code-evaluator:/workspace/code-evaluator code-eval:latest bash
|
| 215 |
+
```
|
| 216 |
+
|
| 217 |
+
2. 安装并启动代码评测服务,此时可以根据需要修改本地 `code-evaluator` 中的代码来进行调试
|
| 218 |
+
|
| 219 |
+
```shell
|
| 220 |
+
cd code-evaluator && pip install -r requirements.txt
|
| 221 |
+
python server.py
|
| 222 |
+
```
|
opencompass/docs/zh_cn/advanced_guides/compassbench_intro.md
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CompassBench 介绍
|
| 2 |
+
|
| 3 |
+
## CompassBench 2.0 v1.3 版本
|
| 4 |
+
|
| 5 |
+
CompassBench(官方自建榜单)经历了多次更新迭代,从2024年7月起,OpenCompass将会公布自建榜单的评测规则(评测配置文件)和示例数据集文件,以帮助社区更好的了解自建榜单的评测逻辑和方法。
|
| 6 |
+
|
| 7 |
+
### 能力维度
|
| 8 |
+
|
| 9 |
+
2024年8月榜单将会包括以下能力维度:
|
| 10 |
+
|
| 11 |
+
| 能力 | 任务介绍 | 评测方式 | 示例数据地址 |
|
| 12 |
+
| -------- | -------------------------------------------------------------------------------------- | ------------------- | ------------------------------------------------------------------------------ |
|
| 13 |
+
| 语言 | 评测模型在信息抽取、内容总结、对话、创作等多种任务上的能力 | 主观评测 | https://github.com/open-compass/CompassBench/tree/main/v1_3_data/language |
|
| 14 |
+
| 推理 | 评测模型在逻辑推理、常识推理、表格推理等多种日常推理任务上的能力 | 主观评测 | https://github.com/open-compass/CompassBench/tree/main/v1_3_data/reasoning |
|
| 15 |
+
| 知识 | 评测模型在理科、工科、人文社科等多个领域的知识水平 | 客观评测 | https://github.com/open-compass/CompassBench/tree/main/v1_3_data/knowledge |
|
| 16 |
+
| 数学 | 评测模型在数值计算、高中及大学难度的数学问题上的能力 | 客观评测 | https://github.com/open-compass/CompassBench/tree/main/v1_3_data/math |
|
| 17 |
+
| 代码 | 评测模型在代码生成、代码补全、代码注释、代码重构、代码改写、计算机知识综合问答上的能力 | 客观评测 + 主观评测 | https://github.com/open-compass/CompassBench/tree/main/v1_3_data/code |
|
| 18 |
+
| 指令跟随 | 评测模型在基于各类语言、推理、知识等任务中,能否准确遵循复杂指令的能力 | 主观评测 | https://github.com/open-compass/CompassBench/tree/main/v1_3_data/instruct |
|
| 19 |
+
| 智能体 | 评估模型在复杂工具调用的能力,以及数据科学、数学等情况下使用代码解释器的能力 | 客观评测 | https://github.com/open-compass/T-Eval https://github.com/open-compass/CIBench |
|
| 20 |
+
|
| 21 |
+
### 评测方法
|
| 22 |
+
|
| 23 |
+
- 对于客观评测,将会采用0-shot + CoT的方式评测。
|
| 24 |
+
- OpenCompass在客观题评测的后处理上已进行较多优化,并在评测时在Prompt中对回答格式进行约束,对于因指令跟随问题带来的无法完成答案提取的情况,将视为回答错误
|
| 25 |
+
- 数学、智能体题目类型与给定的示例数据类似,但真实评测数据与开源数据不同
|
| 26 |
+
- 对于主观评测,将会采用基于大模型评价的方式进行评测。
|
| 27 |
+
- 我们对每一道问题均提供评测时的打分指引。
|
| 28 |
+
|
| 29 |
+
- 比较待测模型相对于参考回复的胜率,共设置为五档
|
| 30 |
+
|
| 31 |
+
- `A++`:回答A远胜于回答B。
|
| 32 |
+
- `A+`:回答A略优于回答B。
|
| 33 |
+
- `A=B`:回答A和回答B质量相同。
|
| 34 |
+
- `B+`:回答B略优于回答A。
|
| 35 |
+
- `B++`:回答B远胜于回答A。
|
| 36 |
+
- 主观评测配置文件
|
| 37 |
+
- [示例评测配置](https://github.com/open-compass/opencompass/blob/main/configs/eval_compassbench_v1_3_subjective.py)
|
| 38 |
+
- 主观评价提示词
|
| 39 |
+
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
# Instruction
|
| 43 |
+
|
| 44 |
+
You are an expert evaluator. Your task is to evaluate the quality of the \
|
| 45 |
+
responses generated by two AI models.
|
| 46 |
+
We will provide you with the user query and a pair of AI-generated \
|
| 47 |
+
responses (Response A and Response B).
|
| 48 |
+
You should first read the user query and the conversation history \
|
| 49 |
+
carefully for analyzing the task, and then evaluate the quality of the \
|
| 50 |
+
responses based on and rules provided below.
|
| 51 |
+
|
| 52 |
+
# Conversation between User and AI
|
| 53 |
+
|
| 54 |
+
## User Query
|
| 55 |
+
<|begin_of_query|>
|
| 56 |
+
|
| 57 |
+
{question}
|
| 58 |
+
|
| 59 |
+
<|end_of_query|>
|
| 60 |
+
|
| 61 |
+
## Response A
|
| 62 |
+
<|begin_of_response_A|>
|
| 63 |
+
|
| 64 |
+
{prediction}
|
| 65 |
+
|
| 66 |
+
<|end_of_response_A|>
|
| 67 |
+
|
| 68 |
+
## Response B
|
| 69 |
+
<|begin_of_response_B|>
|
| 70 |
+
|
| 71 |
+
{prediction2}
|
| 72 |
+
|
| 73 |
+
<|end_of_response_B|>
|
| 74 |
+
|
| 75 |
+
# Evaluation
|
| 76 |
+
|
| 77 |
+
## Checklist
|
| 78 |
+
|
| 79 |
+
<|begin_of_checklist|>
|
| 80 |
+
|
| 81 |
+
{checklist}
|
| 82 |
+
|
| 83 |
+
<|end_of_checklist|>
|
| 84 |
+
|
| 85 |
+
Please use this checklist to guide your evaluation, but do not limit your \
|
| 86 |
+
assessment to the checklist.
|
| 87 |
+
|
| 88 |
+
## Rules
|
| 89 |
+
|
| 90 |
+
You should compare the above two responses based on your analysis of the \
|
| 91 |
+
user queries and the conversation history.
|
| 92 |
+
You should first write down your analysis and the checklist that you used \
|
| 93 |
+
for the evaluation, and then provide your assessment according to the \
|
| 94 |
+
checklist.
|
| 95 |
+
There are five choices to give your final assessment: ["A++", "A+", \
|
| 96 |
+
"A=B", "B+", "B++"], which correspond to the following meanings:
|
| 97 |
+
|
| 98 |
+
- `A++`: Response A is much better than Response B.
|
| 99 |
+
- `A+`: Response A is only slightly better than Response B.
|
| 100 |
+
- `A=B`: Response A and B are of the same quality. Please use this \
|
| 101 |
+
choice sparingly.
|
| 102 |
+
- `B+`: Response B is only slightly better than Response A.
|
| 103 |
+
- `B++`: Response B is much better than Response A.
|
| 104 |
+
|
| 105 |
+
## Output Format
|
| 106 |
+
First, please output your analysis for each model response, and \
|
| 107 |
+
then summarize your assessment to three aspects: "reason A=B", \
|
| 108 |
+
"reason A>B", and "reason B>A", and finally make your choice for \
|
| 109 |
+
the final assessment.
|
| 110 |
+
|
| 111 |
+
Please provide your evaluation results in the following json \
|
| 112 |
+
format by filling in the placeholders in []:
|
| 113 |
+
|
| 114 |
+
{
|
| 115 |
+
"analysis of A": "[analysis of Response A]",
|
| 116 |
+
"analysis of B": "[analysis of Response B]",
|
| 117 |
+
"reason of A=B": "[where Response A and B perform equally well]",
|
| 118 |
+
"reason of A>B": "[where Response A is better than Response B]",
|
| 119 |
+
"reason of B>A": "[where Response B is better than Response A]",
|
| 120 |
+
"choice": "[A++ or A+ or A=B or B+ or B++]",
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
# 指令
|
| 125 |
+
|
| 126 |
+
您是一位专业评估专家。您的任务是评估两个AI模型生成回答的质量。
|
| 127 |
+
我们将为您提供用户问题及一对AI生成的回答(回答A和回答B)。
|
| 128 |
+
您应当首先仔细阅读用户问题,然后根据以下提供的规则评估回答的质量。
|
| 129 |
+
|
| 130 |
+
# 用户与AI之间的对话
|
| 131 |
+
|
| 132 |
+
## 用户问题
|
| 133 |
+
<|begin_of_query|>
|
| 134 |
+
|
| 135 |
+
{question}
|
| 136 |
+
|
| 137 |
+
<|end_of_query|>
|
| 138 |
+
|
| 139 |
+
## 回答A
|
| 140 |
+
<|begin_of_response_A|>
|
| 141 |
+
|
| 142 |
+
{prediction}
|
| 143 |
+
|
| 144 |
+
<|end_of_response_A|>
|
| 145 |
+
|
| 146 |
+
## 回答B
|
| 147 |
+
<|begin_of_response_B|>
|
| 148 |
+
|
| 149 |
+
{prediction2}
|
| 150 |
+
|
| 151 |
+
<|end_of_response_B|>
|
| 152 |
+
|
| 153 |
+
# 评估
|
| 154 |
+
|
| 155 |
+
## 检查清单
|
| 156 |
+
|
| 157 |
+
<|begin_of_checklist|>
|
| 158 |
+
|
| 159 |
+
{checklist}
|
| 160 |
+
|
| 161 |
+
<|end_of_checklist|>
|
| 162 |
+
|
| 163 |
+
请参考此检查清单来评估回答的质量,但不要局限于此检查清单。
|
| 164 |
+
|
| 165 |
+
## 规则
|
| 166 |
+
|
| 167 |
+
您应当基于用户查询,分析比较上述两种回答。
|
| 168 |
+
您应当基于检查清单写下您的分析,然后提供您的评价。
|
| 169 |
+
有五个选项供您做出最终评估:["A++", "A+", "A=B", "B+", "B++"],它们对应如下含义:
|
| 170 |
+
|
| 171 |
+
- `A++`:回答A远胜于回答B。
|
| 172 |
+
- `A+`:回答A略优于回答B。
|
| 173 |
+
- `A=B`:回答A和回答B质量相同。请谨慎使用此选项。
|
| 174 |
+
- `B+`:回答B略优于回答A。
|
| 175 |
+
- `B++`:回答B远胜于回答A。
|
| 176 |
+
|
| 177 |
+
## 输出格式
|
| 178 |
+
首先,请输出您对每个模型回答的分析,
|
| 179 |
+
然后总结您的评估到三个方面:"A=B的理由","A优于B的理由",和 "B优于A的理由",
|
| 180 |
+
最后做出您对最终评估的选择。
|
| 181 |
+
|
| 182 |
+
请按照以下json格式提供您的评估结果,通过填充[]中的占位符:
|
| 183 |
+
|
| 184 |
+
{
|
| 185 |
+
"回答A的分析": "[回答A的分析]",
|
| 186 |
+
"回答B的分析": "[回答B的分析]",
|
| 187 |
+
"A=B的理由": "[A和B回答差不多的理由]",
|
| 188 |
+
"A优于B的理由": "[回答A优于B的理由]",
|
| 189 |
+
"B优于A的理由": "[回答B优于A的理由]",
|
| 190 |
+
"choice": "[A++ or A+ or A=B or B+ or B++]",
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
```
|
opencompass/docs/zh_cn/advanced_guides/compassbench_v2_0.md
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CompassBench 2.0 介绍
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
## v1.0介绍
|
| 5 |
+
为支持OpenCompass的年度榜单,本文将提供CompassBench的整体介绍。
|
| 6 |
+
|
| 7 |
+
本次评测将在语言、知识、创作、推理、数学、代码、长文本、智能体能力的多项任务上开展评测,现提供任务介绍和题目示例。
|
| 8 |
+
|
| 9 |
+
- 评测方式采用主观与客观相结合的方式,具体根据各个任务不同进行具体设计。
|
| 10 |
+
- 针对推理、数学、代码、智能体等任务,将会采用Few-shot + CoT的评测方式。
|
| 11 |
+
- 对于填空题,通过在Prompt中提供Few-shot和输出格式约束来协助抽取答案。
|
| 12 |
+
- 对于选择题,针对同一问题,通过变换提问方式,减少随机影响。
|
| 13 |
+
- 对于开放式问题的评测,对同一问题进行多次采样,并采用多维度打分的方式进行评价。
|
| 14 |
+
|
| 15 |
+
> OpenCompass在客观题评测的后处理上已进行较多优化,并在评测时在Prompt中对回答格式进行约束,对于因指令跟随问题带来的无法完成答案提取的情况,将视为回答错误。OpenCompass将会在下一期加入指令跟随能力的评测。
|
| 16 |
+
|
| 17 |
+
| 能力 | 任务 | 介绍 | 题目示例 |
|
| 18 |
+
| ---- | ---- | ---- | ---- |
|
| 19 |
+
| 语言 | 信息抽取 | 信息抽取是指从文本中提取出特定类型的信息。这类任务通常用于处理结构化数据、知识图谱、问答系统等场景。 | ```"question": "野马队在分区轮以 23–16 击败了匹兹堡钢人队,在比赛的最后三分钟拿下 11 分。然后他们在美式足球联合会 (AFC) 锦标赛上以 20–18 击败了第 49 届超级碗卫冕冠军新英格兰爱国者队,在比赛还剩 17 秒 时拦截了新英格兰队的两分转换传球。尽管曼宁在本赛季的拦截上有问题,但他在两场季后赛中未投任何球。\n野马队在 AFC 锦标赛中打败了谁?"``` |
|
| 20 |
+
| 语言 | 意图识别 | 意图识别是对用户输入的文本或语音进行分析,判断其意图或需求。这类任务应用于智能客服、语音助手、聊天机器人等场景。 | ```"question": "中国文化的天人合一思想\n中西文化的基本差异之一就是,在人与自然的关系问题上,中国文化比较重视人与自然的和谐统一,而西方文化则强调,人要征服自然、改造自然才能求得自己的生存和发展。中国文化的这种特色,有时通过“天人合一”的命题表述出来。中国古代思想家一般都反对把天与人割裂开来、对立起来,而主张天人协调、天人合一。\n天人合一问题,就其理论实质而言,是关于人与自然的统一问题,或者说是自然界和精神的统一问题。应当承认,中国传统文化中的天人合一思想,内容十分复杂,其中既有正确的观点,也有错误的观点,我们必须实事求是地予以分析。但是,从文化的民族性以及对民族文化的推进作用和深远影响看,我们应当大胆肯定。中国古代思想家关于天人合一的思想,其最基本的涵义,就是充分肯定自然界和精神的统一,关注人类行为与自然界的协调问题。从这个意思上说,天人合一思想的,是非常有价值的。\n恩格斯对自然和精神的统一问题,有过一系列精辟的论述。他说:“我们一天天地学会更加正确地理解自然规律,学会认识我们对于自然界的惯常行程的干涉所引起的比较近或比较远的影响。”他还说:“自然界和精神是统一的。自然界不能是无理性的……而理性是不能和自然界矛盾的。”“思维规律和自然规律,只要它们被正确地认识,必然是互相一致的。”恩格斯的这些论述,深刻地揭示了自然和精神统一问题的丰富内涵。根据恩格斯的这些论述,考察中国古代的天人合一思想,不难看出,这种思想有着深刻的合理性。\n中国古代的天人合一思想,强调人与自然的统一,人的行为与自然的协调,道德理性与自然理性的一致,充分显示了中国古代思想家对于主客体之间、主观能动性和客观规律之间关系的辩证思考。根据这种思想,人不能违背自然规律,不能超越自然界的承受力去改造自然、征服自然、破坏自然,而只能在顺从自然规律的条件下去利用自然、调整自然,使之更符合人类的需要,也使自然界的万物都能生长发展。另一方面,自然界也不是主宰人其社会的神秘力量,而是可以认识、可以为我所用的客观对象。这种思想长期实践的结果,是达到自然界与人的统一,人的精神、行为与外在自然的统一,自我身心平衡与自然环境平衡的统一,以及由于这些统一而达到的天道与人道的统一,从而实现完满和谐的精神追求。中国文化的天人合一思想,对于解决当今世界由于工业化和无限制地征服自然而带来的自然环境被污染、生态平衡遭破坏等问题,具有重要的启迪意义;对于我们今天正在进行的社会主���现代化建设,更有着防患于未然的重大现实意义。\n(选自张岱年等主编的《中国文化概论》,有删改)\n根据原文提供的信息,下列推断不正确的一项是","A": "对人与自然关系的认识,中国古代天人合一思想有优于西方文化的地方。","B": "现代人重视和研究天人合一思想,是基于对现实及发展问题的思考。", "C": "肯定天人合一思想的合理性,并不意味着对其思想内容的全盘接受。", "D": "以天人合一思想为指导,可解决当今世界因工业化带来的各种社会问题。",``` |
|
| 21 |
+
| 语言 | 情感分析 | 情感分析是对文本中的情感或情绪进行识别和分析的任务。这类任务可用于情感倾向分析场景。例如,分析社交媒体上的用户评论,了解新闻或事件的倾向。| ```"question": "请问以下评价是正面评价还是负面评价?\n大众点评网的霸王餐,200份华辉拉肠双人试吃,员村一店是已经有经营两年以上的,年前装修过,干净齐整,下单的服务员亲切有礼,可能我是第一个用代码验证的,中间拖了点时间去验证,幸好周日10点左右没有平时的多人。拉肠一如既往的滑,皮蛋瘦肉粥很绵,皮蛋瘦肉超多,肉肠是一底带肉一底斋肠,以前没吃过鸡蛋肠觉得6蚊不太划算,现在发现是有三底肠粉的哦,不太喜欢吃肉的可以试下,很饱肚,鼓油是吃过这么多家肠粉店味道调得最好的。","A": "正面评价", "B": "负面评价"```|
|
| 22 |
+
| 语言 | 内容总结 | 内容总结是将一篇较长的文本压缩成一篇简短的概括性摘要。这类任务适用于需要快速了解文档核心内容的情境,如新闻标题、电子邮件摘要 | ```联合国减灾办公室负责人格拉瑟。联合国减灾办公室2016年2月11日联合国减灾办公室今天表示,2015年是有记录以来最热的一个年份,在这一年当中,自然灾害影响了近1亿人口。减灾办公室呼吁各国采取行动,应对气候变化,在最大程度上做出努力,防止和减少灾害的发生。联合国减灾办公室所公布的最新数据显示,在过去一年当中,受到灾害影响最重的国家都在亚洲,它们是中国、印度、菲律宾和印度尼西亚。自然灾害共导致2万2000人死亡,带来的经济损失约合660亿美元。然而,尽管这一数字惊人,但却低于1400亿的10年平均数字。其中的部分原因是各国政府采取了更好的防范措施。数据显示,2015年有5000万人深受旱灾之苦,增幅达40%。联合国减灾办公室负责人格拉瑟表示,2015年是记载中最热的一个年份,成因是气候变化和厄尔尼诺天气现象。他指出,最令人感到不安的一个趋势是2015年有记录的主要干旱增加了一倍。他强调,数据表明,减少温室气体排放和适应气候变化对于减少灾害风险至关重要。```|
|
| 23 |
+
| 语言 | 内容评价 | 内容评价是对文本的质量、价值或观点进行判断和评价的任务。这类任务可用于评论筛选、观点挖掘等场景。 | ```"question": "以下是一个问题以及针对该问题的两个答案,哪个答案更好?\n问题:创建一篇1000字的非剽窃新闻文章,关于任天堂将于2月8日星期三播出新的任天堂直面会,承诺将公布即将推出的Switch游戏的新细节。2月的任天堂直面会将在东部时间下午5点/太平洋时间下午2点,在公司的YouTube频道上直播。\n\n任天堂表示,星期三的任天堂直面会将持续“大约”40分钟,并将重点放在即将在2023年上半年推出的Nintendo Switch游戏上。\n\n任天堂宣布的Nintendo Switch游戏阵容包括《星之卡比:梦之泉豪华版》,这是2011年Wii游戏的重制版;《魔兵雅各:樱花与失落的恶魔》,这是一个以童话为灵感的冒险故事,主角是年轻的魔兵雅各;《塞尔达传说:王国之泪》,是《荒野之息》的续作;以及《皮克敏4》,这是一款长期运行的策略/动作系列的新作,仍然有些神秘。\n\n此外,任天堂的计划中,但日期尚未确定的还有《超级战争大战1+2:重启营》,这是两款Game Boy Advance时代的《超级战争》游戏的重制版。经过长时间的延迟,Switch版的《超级战争》合集据传很快就要推出。当然,任天堂在2023年还有其他一些大计划,包括2月17日在好莱坞环球影城开设的超级任天堂乐园主题公园,以及4月7日上映的《超级马里奥兄弟》电影。格斗游戏确实很酷,但可能会让人望而生畏。你可能会看到铁拳或街霸这样的竞技世界,然后想,“我可能很菜!而且没有正确的装备!而且我不懂这些术语!”但是停下来。你可以进入格斗游戏。对Pat Gill来说,这段旅程真的很有意义。以下是他如何拥抱格斗游戏,以及你也可以。\n答案 1:任天堂粉丝们正急切地期待着最新的任天堂直面发布会,该发布会定于2月8日星期三播出。该发布会将在公司的YouTube频道上以东部时间下午5点/太平洋时间下午2点的直播形式进行,承诺提供有关任天堂Switch即将推出的游戏的令人兴奋的新细节。\n\n据任天堂称,直面发布会将持续大约40分钟,专门关注2023年上半年将为Switch发布的游戏。已经宣布的一些游戏标题包括《星之卡比:梦之泉DX》、《魔兽传说:樱花与失落的恶魔》、《塞尔达传说:王国的泪滴》和《皮克敏4》。\n\n任天堂日程表上最受期待的游戏之一是《超级战争大战1+2:重启训练营》,这是两款经典Game Boy Advance游戏的重制版。尽管尚未正式宣布发行日期,但有传言称游戏将在经过长时间延迟后很快推出。\n\n除了游戏阵容外,任天堂还计划在2023年推出许多其他令人兴奋的项目,包括2月17日在好莱坞环球影城开放的超级任天堂乐园主题公园,以及4月7日上映的《超级马里奥兄弟》电影。\n\n对于格斗游戏爱好者来说,任天堂直面发布会承诺将是一个令人兴奋的活动,将提供任天堂Switch最新和最伟大游戏的一瞥。无论您是经验丰富的玩家还是刚刚开始,直面发布会中肯定会有适合每个人的内容。因此,请务必在2月8日星期三收看,以获取来自任天堂的所有最新消息和更新!\n答案 2:任天堂宣布将于2月8日星期三下午5点EST/下午2点PST,在公司的YouTube频道上播出新的任天堂Direct演示。演示将持续大约40分钟,并将提供有关2023年上半年即将推出的任天堂Switch游戏的新细节。\n\n公司宣布的任天堂Switch阵容包括《星之卡比:梦之返乡豪华版》,这是2011年Wii游戏的重制版;《魔兽师起源:赛瑞莎与失落的恶魔》,一个以童话为灵感的冒险,主角是年轻的贝约奈塔;《塞尔达传说:王国之泪》,《荒野之息》的续作;以及《皮克敏4》,这是一款长寿的策略/动作系列的新作,仍然有些神秘。\n\n此外,任天堂还有一些其他大计划,包括在2023年2月17日在好莱坞环球影城开设超级任天堂乐园主题公园,以及于4月7日上映《超级马里奥兄弟电影》。\n\n格斗游戏是一种受欢迎的游戏类型,可能是一种令人望而生畏的爱好。然而,人们是可以享受格斗游戏的,Pat Gill就是如何拥抱这种爱好的一个很好的例子。他从一个初学者开始,发现这是一段有意义的旅程。只要有正确的心态和资源,任何人都可以参与格斗游戏,并享受它们所提供的刺激和竞争。" ``` |
|
| 24 |
+
| 语言 | 多语言翻译 | 多语言翻译是将一种语言的文本转换为另一种语言的文本。这类任务适用于跨语言沟通、在线翻译等场景。|```"question": "Translate the following sentence from English to French: \"He [Wales] basically lied to us from the start. First, by acting as if this was for legal reasons. Second, by pretending he was listening to us, right up to his art deletion."```|
|
| 25 |
+
| 语言 | 中华传统文化理解 | 中华传统文化涉及对中国古代文学、艺术、哲学、历史等领域的研究 | ``` "question": "王实甫在《西厢记》中写道:“淋漓襟袖啼红泪,比司马青衫更湿”,其中“司马青衫”指的是什么"``` |
|
| 26 |
+
| 语言 | 中文语意理解 | 中文语意理解涉及理解文本中的词汇、短语和句子之间的语义关系,包括但不限于近义词、反义词、整体-部分关系、修饰关系等。 |``` "question": "“繁荣”与以下哪个词具有近义关系?", "A": "盛世", "B": "荣誉", "C": "繁花", "D": "昌盛"```|
|
| 27 |
+
| 语言 | 多轮对话 | 评价模型能否在多轮对话中保持上下文一致性和连贯性的能力,评估模型是否能够理解并记住对话的上下文信息,记住之前的对话内容。 |```[{'role': 'user','content': '我在做一项关于智能手机市场的研究,需要整理一些数据成 Markdown 表格。数据包括品牌名称、市场份额和热销型号。品牌有苹果、三星和华为。苹果的市场份额是30%,热销型号是iPhone 13;三星市场份额是25%,热销型号是Galaxy S21;华为市场份额是20%,热销型号是Mate 40。请帮我做一个表格。'},{'role': 'user','content': '看起来不错,不过我希望表格中的市场份额列展示为百分比和实际销量。苹果的销量是8000万部,三星是6000万部,华为是5000万部。'}, {'role': 'user', 'content': '很好。现在请把表格的标题中文改成英文,并且各列改成对齐方式:品牌列左对齐,市场份额列居中对齐,热销型号列右对齐。'},{'role': 'user', 'content': '可以,我注意到我们可能需要添加一列来表示这些品牌的总收入,苹果为500亿美元,三星为400亿美元,华为为350亿美元。此外,请按市场销量对行进行排序。'}]```|
|
| 28 |
+
| 知识 | 生活常识 | 考察普通社会上智力正常的人皆有或普遍拥有的、大众化的知识 | ```"question": "世界四大文明古国有哪些?"```|
|
| 29 |
+
| 知识 | 自然科学(理科) | 关于自然现象的具体科学,研究自然界的本质和规律(理科):包括不限于数学,物理学,化学,生物学,天文学等 | ```"question": "群的研究对象是什么?"``` |
|
| 30 |
+
| 知识 | 自然科学(工科) | 关于自然现象的具体科学,研究自然界的本质和规律(工科):包括不限于计算机科学,医学,建筑学,材料学,机械学,测量学,气象学,环境学等 | ```"question": "下列关于信息安全的说法,正确的是( )。", "options": ["打开朋友转发的网页链接一定是安全的", "安装了杀毒软件后电脑就不会感染病毒", "数据加密是一种提高信息安全性的有效措施", "手机指纹识别技术能确保手机所有信息的安全"]``` |
|
| 31 |
+
| 知识 | 社会科学 | 研究社会现象的具体科学,力求揭示社会的本质和规律,例如经济学,政治学,军事学,社会学,管理学,教育学等。社会科学主要以人类社会的组织与结构、体制与关系、功能与效率、秩序与规范为研究认识之对象,并通过这种知识来为人类社会的有序管理、高效运作提供知识、理论和手段 | ```"question": "为了避免资金供应短缺和倒闭,企业经营者需要做什么?"``` |
|
| 32 |
+
| 知识 | 人文科学 | 涉及对人的问题的类型思考与情感体验,围绕着关乎人的心灵世界、关乎人的精神生命主题而展开的种种思想、观念、知识和理论的探索。它以人类自身,特别是人的内心情感世界为研究中心,以人自身的发展和完善作为学术探索的出发点和归宿。包括不限于文学,历史学、哲学、艺术、语言等 | ```"question": "光绪二十四年(1898)五月,维新派代表人物康有为从“中体西用”的角度论述了科举制度改革的必要性。这表明他( )", "options": ["在戊戌变法初期思想趋于保守", "认同洋务派的“中体西用”思想", "在教育改革方面与洋务派观点一致", "所说的“体”和“用”与洋务派不同"]``` |
|
| 33 |
+
| 创作 | 内容扩写 | 给定标题或者大纲的基础上,通过增加细节、描述和解释,使内容更加丰富、饱满和具有表现力。这种方法主要用于散文、小说等文学创作,以及学术论文、报告等实用文本 | ```请根据我给出的[外星人入侵、核弹、流亡]这些关键词来撰写一篇[科幻]题材的短篇故事。 \n故事需要拥有[引人入胜]的开头以及[反转]的结局,故事线[跌宕起伏]。\n注意请使用[刘慈欣]的写作风格为我撰写这篇故事。减少赘述,内容中不要有重复或意思相近的段落,大约800字``` |
|
| 34 |
+
| 创作 | 内容续写 | 现有文本的基础上,继续编写后面的内容。这种方法主要用于小说、故事等叙事性文本。续写部分通常要保持与原有文本的风格、情节和人物设定相一致,同时要求作者具备较强的想象力和创造力。 | ```题目:《新型能源技术在工业生产中的应用与效益》随着能源需求的不断增长和传统能源的有限性,新型能源技术在工业领域的应用备受瞩目。本文将着重探讨新型能源技术对工业生产的潜在影响,以及其在提高生产效益和减少环境影响方面的作用。请按照以上题目和摘要,完成一篇不少于1000字的论文``` |
|
| 35 |
+
| 创作 | 内容改写 | 不改变原文主题和基本结构的前提下,对文本进行一定程度的修改、重组和优化。这种方法主要用于修改学术论文、报告、文章等。内容改写的目的是提高文本的表达能力、逻辑性和可读性,同时避免重复。 | ```请帮我总结一封电子邮件的内容,总结需要包含以下四个部分:\n【重要性】根据内容判断事项是否重要,结果包含重要、不重要\n【紧急性】根据内容判断事项是否紧急,结果包含紧急、不紧急\n【核心内容】使用一句简短的话总结邮件最核心的内容。\n【需要回复内容】请判断邮件中哪些内容需要获得我的回复/确认,以列表形式呈现。\n 接下来,请根据下面邮件的内容,进行摘要:\n亲爱的全体员工:\n为了改善大家的身心健康,增强工作效率,公司特别安排了一场瑜伽兴趣培训,现将培训内容通知如下:\n日期及时间:8月15日(周六)上午9:00至11:00\n地点:公司三楼活动室(面积120平米,可容纳30人参加培训)\n培训内容:\n专业瑜伽教练将为大家进行基础的瑜伽技能和健康知识培训。 瑜伽是一种低强度有氧运动,适合各年龄层人群。它能够通过姿势练习、呼吸技巧等,改善身体的柔韧性和平衡感,帮助人体各系统更好地运行,有效减压提神。\n本次培训重点讲解:\n1)基本的瑜伽哲学及其健康效果介绍\n2)冥想和呼吸技巧演练\n3)10多个常见的基础瑜伽姿势示范及练习(包括猿人式、波浪式、斜 Supported Headstand 等)\n4)瑜伽练习时需要注意的安全事项\n5)瑜伽适宜穿戴的服装和个人物品\n6)参与培训后如何延续瑜伽运动\n培训具体流程:\n9:00-9:30 瑜伽基本概念介绍\n9:30-10:10 练习冥想、呼吸及基础姿势\n10:10-10:30 小休10分钟\n10:30-11:00 继续练习高难度姿势并解答问题\n如有意参加本次瑜伽兴趣培训,请于8月10日前用邮件或电话方式告知我们,我方将安排培训。\n若您有任何问题或建议,也欢迎与我联系。感谢您的收听与参与。```|
|
| 36 |
+
| 推理 | 逻辑推理 | 综合考察模型的几种常见逻辑推理模式:如演绎、归纳和溯因。 | ```"question": "在接下来的文本中,符号 -> 代表着一个简单的数学运算。\n695 - 472 -> 229\n222 - 62 -> 166\n689 - 439 -> ?",```|
|
| 37 |
+
| 推理 | 常识推理 | 常识推理是指基于日常生活中积累的知识和经验,对事物进行合理推断和判断的过程。它涉及到对常见事物、现象和规律的理解,通过综合分析得出合理的结论。 | ```"question": "美即好效应,指对一个外表英俊漂亮的人,人们很容易误认为他或她的其他方面也很不错。根据上述定义,下列哪项属于美即好效应?( )", "A": "外表英俊漂亮的人在应聘中更受招聘者的青睐", "B": "小芳认为自己的女儿是幼儿园中最漂亮的孩子", "C": "人们常说女孩因为可爱而美丽并非因为美丽而可爱", "D": "购物网站上有一个漂亮的模特往往会提高产品的销量"``` |
|
| 38 |
+
| 数学 | 初等数学 | 初等教育数学能力(小学数学) | ```"question": "小芳手上有40元。她的爸爸又给了她100元。她花了30元买了一条牛仔裤,又花了20元买了一个包。那么小芳还剩下多少钱呢?"```|
|
| 39 |
+
| 数学 | 中等数学 | 中等教育数学能力(初中和高中数学) | ```"question": "某地开展建设绿色家园活动,活动期间,计划每天种植相同数量的树木.该活动开始后,实际每天比原计划每天多植树$50$棵,实际植树$400$棵所需时间与原计划植树$300$棵所需时间相同.设实际每天植树$x$棵,则下列方程正确的是( )", "options": ["$\\frac{{400}}{{x-50}}=\\frac{{300}}{x}$", "$\\frac{{300}}{{x-50}}=\\frac{{400}}{x}$", "$\\frac{{400}}{{x+50}}=\\frac{{300}}{x}$", "$\\frac{{300}}{{x+50}}=\\frac{{400}}{x}$"]```|
|
| 40 |
+
| 数学 | 高等 | 高教育数学能力(大学和研究生数学) | ```"question": "已知有向曲线 $L$ 为球面 $x^2+y^2+z^2=2x$ 与平面 $2x-z-1=0$ 的交线,从 $z$ 轴正向往 $z$ 轴负向看去为逆时针方向,计算曲线积分$\\int_L(6xyz-yz^2)dx+2x^2zdy+xyzdz$.", "options": [ "$\\frac{4\\pi}{7\\sqrt5}$", "$\\frac{3\\pi}{7\\sqrt5}$", "$\\frac{3\\pi}{5\\sqrt5}$", "$\\frac{4\\pi}{5\\sqrt5}$"]``` |
|
| 41 |
+
| 代码 | 代码理解 | 输入为用户的需求文字或者部分代码,考察模型的逻辑推理能力和代码生成能力,考察模型对各类编程语言的掌握程度。内容包括不限于:算法和数据结构能力考察编程语言语法考察跨编程语言转换 | ```"question": "编写一个 Python 函数,用于检查两个数字是否仅在一个位置上不同。"```|
|
| 42 |
+
| 代码 | 代码分析 | 考察模型对代码的理解和分析能力,给定一段代码,进行代码意图分析,代码规范检查,错误检查等 | ```"question":"\n\ndef truncate_number(number: float) -> float:\n \"\"\" 给定一个正的浮点数,可以将其分解为整数部分(小于给定数字的最大整数)和小数部分(余数部分总是小于1)。\n\n 返回该数字的小数部分。\n >>> truncate_number(3.5)\n 0.5\n \"\"\"",``` |
|
| 43 |
+
| 长文本 | 长文本理解与推理 | 考察模型在不同的长度上下文(2k, 4k, 8k, 16k, 32k)情况下的理解和推理能力 | 略 |
|
| 44 |
+
| 智能体 | 任务规划 | 智能体根据用户的需求目标和具备工具条件,进行合理的任务拆解,科学地安排子任务的执行顺序和策略,对任务执行路径进行设计和规划,选择合适的策略。 | 略|
|
| 45 |
+
| 智能体 | 工具调用 | 评估模型能否准确的调用合适的API,在调用API时能否正确的传递参数 | 略 |
|
| 46 |
+
| 智能体 | 反思能力 | 评估模型在子任务执行失败时,是否具有反思和重新规划任务路径的能力 | 略 |
|
| 47 |
+
| 智能体 | 任务执行总结 | 评估模型能否根据子任务的执行结果进行总结分析,完成原始任务目标,正确地按指令输出回复 | 略|
|
| 48 |
+
| 智能体 | 多轮交互 | 评估模型在进行多轮复杂工具调用时的能力,在多轮情况下能否准确理解意图 | 略 |
|
opencompass/docs/zh_cn/advanced_guides/contamination_eval.md
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 数据污染评估
|
| 2 |
+
|
| 3 |
+
**数据污染** 是指本应用于下游测试任务中的数据出现在了大语言模型 (LLM) 的训练数据中,从而导致在下游任务 (例如,摘要、自然语言推理、文本分类) 上指标虚高,无法反映模型真实泛化能力的现象。
|
| 4 |
+
|
| 5 |
+
由于数据污染的源头是出现在 LLM 所用的训练数据中,因此最直接的检测数据污染的方法就是将测试数据与训练数据进行碰撞,然后汇报两者之间有多少语料是重叠出现的,经典的 GPT-3 [论文](https://arxiv.org/pdf/2005.14165.pdf)中的表 C.1 报告了相关内容。
|
| 6 |
+
|
| 7 |
+
但如今开源社区往往只会公开模型参数而非训练数据集,在此种情况下,如何判断是否存在数据污染问题或污染程度如何,这些问题还没有被广泛接受的解决方案。OpenCompass 提供了两种可能的解决方案。
|
| 8 |
+
|
| 9 |
+
## 基于自建同分布数据的污染数据标注
|
| 10 |
+
|
| 11 |
+
我们参考了 [Skywork](https://arxiv.org/pdf/2310.19341.pdf) 中 5.2 节提到的方法,直接使用了 Skywork 上传到 HuggingFace 上的数据集 [mock_gsm8k_test](https://huggingface.co/datasets/Skywork/mock_gsm8k_test)。
|
| 12 |
+
|
| 13 |
+
在该方法中,作者使用 GPT-4 合成了一批与原始 GSM8K 风格类似的数据,然后使用模型分别计算在 GSM8K 训练集 (train),GSM8K 测试集 (test),GSM8K 参考集 (ref) 上的困惑度。由于 GSM8K 参考集是最新生成的,作者认为它必然不属于任何模型的任何训练集中,即它是干净的。作者认为:
|
| 14 |
+
|
| 15 |
+
- 若 测试集 的困惑度远小于 参考集 的困惑度,那么 测试集 可能出现在了模型的训练阶段;
|
| 16 |
+
- 若 训练集 的困惑度远小于 测试集 的困惑度,那么 训练集 可能被模型过拟合了。
|
| 17 |
+
|
| 18 |
+
我们可以参考使用以下配置文件:
|
| 19 |
+
|
| 20 |
+
```python
|
| 21 |
+
from mmengine.config import read_base
|
| 22 |
+
|
| 23 |
+
with read_base():
|
| 24 |
+
from .datasets.gsm8k_contamination.gsm8k_contamination_ppl_ecdd22 import gsm8k_datasets # 包含训练、测试、参考集
|
| 25 |
+
from .models.qwen.hf_qwen_7b import models as hf_qwen_7b_model # 待审查的模型
|
| 26 |
+
from .models.yi.hf_yi_6b import models as hf_yi_6b_model
|
| 27 |
+
|
| 28 |
+
datasets = [*gsm8k_datasets]
|
| 29 |
+
models = [*hf_qwen_7b_model, *hf_yi_6b_model]
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
其样例输出如下:
|
| 33 |
+
|
| 34 |
+
```text
|
| 35 |
+
dataset version metric mode internlm-7b-hf qwen-7b-hf yi-6b-hf chatglm3-6b-base-hf qwen-14b-hf baichuan2-13b-base-hf internlm-20b-hf aquila2-34b-hf ...
|
| 36 |
+
--------------- --------- ----------- ------- ---------------- ------------ ---------- --------------------- ------------- ----------------------- ----------------- ---------------- ...
|
| 37 |
+
gsm8k-train-ppl 0b8e46 average_ppl unknown 1.5 0.78 1.37 1.16 0.5 0.76 1.41 0.78 ...
|
| 38 |
+
gsm8k-test-ppl 0b8e46 average_ppl unknown 1.56 1.33 1.42 1.3 1.15 1.13 1.52 1.16 ...
|
| 39 |
+
gsm8k-ref-ppl f729ba average_ppl unknown 1.55 1.2 1.43 1.35 1.27 1.19 1.47 1.35 ...
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
目前该方案仅支持 GSM8K 数据集,我们欢迎社区贡献更多的数据集。
|
| 43 |
+
|
| 44 |
+
如果使用了该方法,请添加引用:
|
| 45 |
+
|
| 46 |
+
```bibtex
|
| 47 |
+
@misc{2023opencompass,
|
| 48 |
+
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
|
| 49 |
+
author={OpenCompass Contributors},
|
| 50 |
+
howpublished = {\url{https://github.com/open-compass/opencompass}},
|
| 51 |
+
year={2023}
|
| 52 |
+
}
|
| 53 |
+
@misc{wei2023skywork,
|
| 54 |
+
title={Skywork: A More Open Bilingual Foundation Model},
|
| 55 |
+
author={Tianwen Wei and Liang Zhao and Lichang Zhang and Bo Zhu and Lijie Wang and Haihua Yang and Biye Li and Cheng Cheng and Weiwei Lü and Rui Hu and Chenxia Li and Liu Yang and Xilin Luo and Xuejie Wu and Lunan Liu and Wenjun Cheng and Peng Cheng and Jianhao Zhang and Xiaoyu Zhang and Lei Lin and Xiaokun Wang and Yutuan Ma and Chuanhai Dong and Yanqi Sun and Yifu Chen and Yongyi Peng and Xiaojuan Liang and Shuicheng Yan and Han Fang and Yahui Zhou},
|
| 56 |
+
year={2023},
|
| 57 |
+
eprint={2310.19341},
|
| 58 |
+
archivePrefix={arXiv},
|
| 59 |
+
primaryClass={cs.CL}
|
| 60 |
+
}
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
## 基于经典预训练集的污染数据标注
|
| 64 |
+
|
| 65 |
+
感谢 [Contamination_Detector](https://github.com/liyucheng09/Contamination_Detector) 以及 @liyucheng09 提供了本方法。
|
| 66 |
+
|
| 67 |
+
在该方法中,作者将测试数据集 (例如 C-Eval, ARC, HellaSwag 等) 使用 Common Crawl 数据库和 Bing 搜索引擎来进行检索,然后依次标记每条测试样本是 干净的 / 题目被污染的 / 题目和答案均被污染的。
|
| 68 |
+
|
| 69 |
+
测试时,OpenCompass 会分别汇报 ceval 在三种标签所组成的子集上的准确率或困惑度。一般来说,准确率从低到高依次是 干净的,题目被污染的,题目和答案均被污染的 子集。作者认为:
|
| 70 |
+
|
| 71 |
+
- 若三者性能较为接近,则模型在该测试集上的污染程度较轻;反之则污染程度较重。
|
| 72 |
+
|
| 73 |
+
我们可以参考使用以下配置文件 [link](https://github.com/open-compass/opencompass/blob/main/configs/eval_contamination.py):
|
| 74 |
+
|
| 75 |
+
```python
|
| 76 |
+
from mmengine.config import read_base
|
| 77 |
+
|
| 78 |
+
with read_base():
|
| 79 |
+
from .datasets.ceval.ceval_clean_ppl import ceval_datasets # 有污染标记的 ceval 数据集
|
| 80 |
+
from .models.yi.hf_yi_6b import models as hf_yi_6b_model # 待审查的模型
|
| 81 |
+
from .models.qwen.hf_qwen_7b import models as hf_qwen_7b_model
|
| 82 |
+
from .summarizers.contamination import ceval_summarizer as summarizer # 输出格式整理
|
| 83 |
+
|
| 84 |
+
datasets = [*ceval_datasets]
|
| 85 |
+
models = [*hf_yi_6b_model, *hf_qwen_7b_model]
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
其样例输出如下:
|
| 89 |
+
|
| 90 |
+
```text
|
| 91 |
+
dataset version mode yi-6b-hf - - qwen-7b-hf - - ...
|
| 92 |
+
---------------------------------------------- --------- ------ ---------------- ----------------------------- --------------------------------------- ---------------- ----------------------------- --------------------------------------- ...
|
| 93 |
+
- - - accuracy - clean accuracy - input contaminated accuracy - input-and-label contaminated accuracy - clean accuracy - input contaminated accuracy - input-and-label contaminated ...
|
| 94 |
+
...
|
| 95 |
+
ceval-humanities - ppl 74.42 75.00 82.14 67.44 50.00 70.54 ...
|
| 96 |
+
ceval-stem - ppl 53.70 57.14 85.61 47.41 52.38 67.63 ...
|
| 97 |
+
ceval-social-science - ppl 81.60 84.62 83.09 76.00 61.54 72.79 ...
|
| 98 |
+
ceval-other - ppl 72.31 73.91 75.00 58.46 39.13 61.88 ...
|
| 99 |
+
ceval-hard - ppl 44.35 37.50 70.00 41.13 25.00 30.00 ...
|
| 100 |
+
ceval - ppl 67.32 71.01 81.17 58.97 49.28 67.82 ...
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
目前该方案仅支持 C-Eval, MMLU, HellaSwag 和 ARC 数据集,[Contamination_Detector](https://github.com/liyucheng09/Contamination_Detector) 中还包含了 CSQA 和 WinoGrande,但目前还没有在 OpenCompass 中实现。我们欢迎社区贡献更多的数据集。
|
| 104 |
+
|
| 105 |
+
如果使用了该方法,请添加引用:
|
| 106 |
+
|
| 107 |
+
```bibtex
|
| 108 |
+
@misc{2023opencompass,
|
| 109 |
+
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
|
| 110 |
+
author={OpenCompass Contributors},
|
| 111 |
+
howpublished = {\url{https://github.com/open-compass/opencompass}},
|
| 112 |
+
year={2023}
|
| 113 |
+
}
|
| 114 |
+
@article{Li2023AnOS,
|
| 115 |
+
title={An Open Source Data Contamination Report for Llama Series Models},
|
| 116 |
+
author={Yucheng Li},
|
| 117 |
+
journal={ArXiv},
|
| 118 |
+
year={2023},
|
| 119 |
+
volume={abs/2310.17589},
|
| 120 |
+
url={https://api.semanticscholar.org/CorpusID:264490711}
|
| 121 |
+
}
|
| 122 |
+
```
|
opencompass/docs/zh_cn/advanced_guides/custom_dataset.md
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 自定义数据集
|
| 2 |
+
|
| 3 |
+
本教程仅供临时性的、非正式的数据集使用,如果所用数据集需要长期使用,或者存在定制化读取 / 推理 / 评测需求的,强烈建议按照 [new_dataset.md](./new_dataset.md) 中介绍的方法进行实现。
|
| 4 |
+
|
| 5 |
+
在本教程中,我们将会介绍如何在不实现 config,不修改 OpenCompass 源码的情况下,对一新增数据集进行测试的方法。我们支持的任务类型包括选择 (`mcq`) 和问答 (`qa`) 两种,其中 `mcq` 支持 `ppl` 推理和 `gen` 推理;`qa` 支持 `gen` 推理。
|
| 6 |
+
|
| 7 |
+
## 数据集格式
|
| 8 |
+
|
| 9 |
+
我们支持 `.jsonl` 和 `.csv` 两种格式的数据集。
|
| 10 |
+
|
| 11 |
+
### 选择题 (`mcq`)
|
| 12 |
+
|
| 13 |
+
对于选择 (`mcq`) 类型的数据,默认的字段如下:
|
| 14 |
+
|
| 15 |
+
- `question`: 表示选择题的题干
|
| 16 |
+
- `A`, `B`, `C`, ...: 使用单个大写字母表示选项,个数不限定。默认只会从 `A` 开始,解析连续的字母作为选项。
|
| 17 |
+
- `answer`: 表示选择题的正确答案,其值必须是上述所选用的选项之一,如 `A`, `B`, `C` 等。
|
| 18 |
+
|
| 19 |
+
对于非默认字段,我们都会进行读入,但默认不会使用。如需使用,则需要在 `.meta.json` 文件中进行指定。
|
| 20 |
+
|
| 21 |
+
`.jsonl` 格式样例如下:
|
| 22 |
+
|
| 23 |
+
```jsonl
|
| 24 |
+
{"question": "165+833+650+615=", "A": "2258", "B": "2263", "C": "2281", "answer": "B"}
|
| 25 |
+
{"question": "368+959+918+653+978=", "A": "3876", "B": "3878", "C": "3880", "answer": "A"}
|
| 26 |
+
{"question": "776+208+589+882+571+996+515+726=", "A": "5213", "B": "5263", "C": "5383", "answer": "B"}
|
| 27 |
+
{"question": "803+862+815+100+409+758+262+169=", "A": "4098", "B": "4128", "C": "4178", "answer": "C"}
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
`.csv` 格式样例如下:
|
| 31 |
+
|
| 32 |
+
```csv
|
| 33 |
+
question,A,B,C,answer
|
| 34 |
+
127+545+588+620+556+199=,2632,2635,2645,B
|
| 35 |
+
735+603+102+335+605=,2376,2380,2410,B
|
| 36 |
+
506+346+920+451+910+142+659+850=,4766,4774,4784,C
|
| 37 |
+
504+811+870+445=,2615,2630,2750,B
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
### 问答题 (`qa`)
|
| 41 |
+
|
| 42 |
+
对于问答 (`qa`) 类型的数据,默认的字段如下:
|
| 43 |
+
|
| 44 |
+
- `question`: 表示问答题的题干
|
| 45 |
+
- `answer`: 表示问答题的正确答案。可缺失,表示该数据集无正确答案。
|
| 46 |
+
|
| 47 |
+
对于非默认字段,我们都会进行读入,但默认不会使用。如需使用,则需要在 `.meta.json` 文件中进行指定。
|
| 48 |
+
|
| 49 |
+
`.jsonl` 格式样例如下:
|
| 50 |
+
|
| 51 |
+
```jsonl
|
| 52 |
+
{"question": "752+361+181+933+235+986=", "answer": "3448"}
|
| 53 |
+
{"question": "712+165+223+711=", "answer": "1811"}
|
| 54 |
+
{"question": "921+975+888+539=", "answer": "3323"}
|
| 55 |
+
{"question": "752+321+388+643+568+982+468+397=", "answer": "4519"}
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
`.csv` 格式样例如下:
|
| 59 |
+
|
| 60 |
+
```csv
|
| 61 |
+
question,answer
|
| 62 |
+
123+147+874+850+915+163+291+604=,3967
|
| 63 |
+
149+646+241+898+822+386=,3142
|
| 64 |
+
332+424+582+962+735+798+653+214=,4700
|
| 65 |
+
649+215+412+495+220+738+989+452=,4170
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
## 命令行列表
|
| 69 |
+
|
| 70 |
+
自定义数据集可直接通过命令行来调用开始评测。
|
| 71 |
+
|
| 72 |
+
```bash
|
| 73 |
+
python run.py \
|
| 74 |
+
--models hf_llama2_7b \
|
| 75 |
+
--custom-dataset-path xxx/test_mcq.csv \
|
| 76 |
+
--custom-dataset-data-type mcq \
|
| 77 |
+
--custom-dataset-infer-method ppl
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
```bash
|
| 81 |
+
python run.py \
|
| 82 |
+
--models hf_llama2_7b \
|
| 83 |
+
--custom-dataset-path xxx/test_qa.jsonl \
|
| 84 |
+
--custom-dataset-data-type qa \
|
| 85 |
+
--custom-dataset-infer-method gen
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
在绝大多数情况下,`--custom-dataset-data-type` 和 `--custom-dataset-infer-method` 可以省略,OpenCompass 会根据以下逻辑进行设置:
|
| 89 |
+
|
| 90 |
+
- 如果从数据集文件中可以解析出选项,如 `A`, `B`, `C` 等,则认定该数据集为 `mcq`,否则认定为 `qa`。
|
| 91 |
+
- 默认 `infer_method` 为 `gen`。
|
| 92 |
+
|
| 93 |
+
## 配置文件
|
| 94 |
+
|
| 95 |
+
在原配置文件中,直接向 `datasets` 变量中添加新的项即可。自定义数据集亦可与普通数据集混用。
|
| 96 |
+
|
| 97 |
+
```python
|
| 98 |
+
datasets = [
|
| 99 |
+
{"path": "xxx/test_mcq.csv", "data_type": "mcq", "infer_method": "ppl"},
|
| 100 |
+
{"path": "xxx/test_qa.jsonl", "data_type": "qa", "infer_method": "gen"},
|
| 101 |
+
]
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
## 数据集补充信息 `.meta.json`
|
| 105 |
+
|
| 106 |
+
OpenCompass 会默认尝试对输入的数据集文件进行解析,因此在绝大多数情况下,`.meta.json` 文件都是 **不需要** 的。但是,如果数据集的字段名不是默认的字段名,或者需要自定义提示词,则需要在 `.meta.json` 文件中进行指定。
|
| 107 |
+
|
| 108 |
+
我们会在数据集同级目录下,以文件名+`.meta.json` 的形式放置一个表征数据集使用方法的文件,样例文件结构如下:
|
| 109 |
+
|
| 110 |
+
```tree
|
| 111 |
+
.
|
| 112 |
+
├── test_mcq.csv
|
| 113 |
+
├── test_mcq.csv.meta.json
|
| 114 |
+
├── test_qa.jsonl
|
| 115 |
+
└── test_qa.jsonl.meta.json
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
该文件可能字段如下:
|
| 119 |
+
|
| 120 |
+
- `abbr` (str): 数据集缩写,作为该数据集的 ID。
|
| 121 |
+
- `data_type` (str): 数据集类型,可选值为 `mcq` 和 `qa`.
|
| 122 |
+
- `infer_method` (str): 推理方法,可选值为 `ppl` 和 `gen`.
|
| 123 |
+
- `human_prompt` (str): 用户提示词模板,用于生成提示词。模板中的变量使用 `{}` 包裹,如 `{question}`,`{opt1}` 等。如存在 `template`,则该字段会被忽略。
|
| 124 |
+
- `bot_prompt` (str): 机器人提示词模板,用于生成提示词。模板中的变量使用 `{}` 包裹,如 `{answer}` 等。如存在 `template`,则该字段会被忽略。
|
| 125 |
+
- `template` (str or dict): 问题模板,用于生成提示词。模板中的变量使用 `{}` 包裹,如 `{question}`,`{opt1}` 等。相关语法见[此处](../prompt/prompt_template.md) 关于 `infer_cfg['prompt_template']['template']` 的内容。
|
| 126 |
+
- `input_columns` (list): 输入字段列表,用于读入数据。
|
| 127 |
+
- `output_column` (str): 输出字段,用于读入数据。
|
| 128 |
+
- `options` (list): 选项列表,用于读入数据,仅在 `data_type` 为 `mcq` 时有效。
|
| 129 |
+
|
| 130 |
+
样例如下:
|
| 131 |
+
|
| 132 |
+
```json
|
| 133 |
+
{
|
| 134 |
+
"human_prompt": "Question: 127 + 545 + 588 + 620 + 556 + 199 =\nA. 2632\nB. 2635\nC. 2645\nAnswer: Let's think step by step, 127 + 545 + 588 + 620 + 556 + 199 = 672 + 588 + 620 + 556 + 199 = 1260 + 620 + 556 + 199 = 1880 + 556 + 199 = 2436 + 199 = 2635. So the answer is B.\nQuestion: {question}\nA. {A}\nB. {B}\nC. {C}\nAnswer: ",
|
| 135 |
+
"bot_prompt": "{answer}"
|
| 136 |
+
}
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
或者
|
| 140 |
+
|
| 141 |
+
```json
|
| 142 |
+
{
|
| 143 |
+
"template": "Question: {my_question}\nX. {X}\nY. {Y}\nZ. {Z}\nW. {W}\nAnswer:",
|
| 144 |
+
"input_columns": ["my_question", "X", "Y", "Z", "W"],
|
| 145 |
+
"output_column": "my_answer"
|
| 146 |
+
}
|
| 147 |
+
```
|
opencompass/docs/zh_cn/advanced_guides/evaluation_lightllm.md
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 评测 Lightllm 模型
|
| 2 |
+
|
| 3 |
+
我们支持评测使用 [Lightllm](https://github.com/ModelTC/lightllm) 进行推理的大语言模型。Lightllm 是由商汤科技开发,是一个基于 Python 的 LLM 推理和服务框架,以其轻量级设计、易于扩展和高速性能而著称,Lightllm 对多种大模型都进行了支持。用户可以通过 Lightllm 进行模型推理,并且以服务的形式在本地起起来,在评测过程中,OpenCompass 通过 api 将数据喂给Lightllm,并对返回的结果进行处理。OpenCompass 对 Lightllm 进行了适配,本教程将介绍如何使用 OpenCompass 来对以 Lightllm 作为推理后端的模型进行评测。
|
| 4 |
+
|
| 5 |
+
## 环境配置
|
| 6 |
+
|
| 7 |
+
### 安装 OpenCompass
|
| 8 |
+
|
| 9 |
+
请根据 OpenCompass [安装指南](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) 来安装算法库和准备数据集。
|
| 10 |
+
|
| 11 |
+
### 安装 Lightllm
|
| 12 |
+
|
| 13 |
+
请根据 [Lightllm 主页](https://github.com/ModelTC/lightllm) 来安装 Lightllm。注意对齐相关依赖库的版本,尤其是 transformers 的版本。
|
| 14 |
+
|
| 15 |
+
## 评测
|
| 16 |
+
|
| 17 |
+
我们以 llama2-7B 评测 humaneval 作为例子来介绍如何评测。
|
| 18 |
+
|
| 19 |
+
### 第一步: 将模型通过 Lightllm 在本地以服务的形式起起来
|
| 20 |
+
|
| 21 |
+
```shell
|
| 22 |
+
python -m lightllm.server.api_server --model_dir /path/llama2-7B \
|
| 23 |
+
--host 0.0.0.0 \
|
| 24 |
+
--port 1030 \
|
| 25 |
+
--nccl_port 2066 \
|
| 26 |
+
--max_req_input_len 4096 \
|
| 27 |
+
--max_req_total_len 6144 \
|
| 28 |
+
--tp 1 \
|
| 29 |
+
--trust_remote_code \
|
| 30 |
+
--max_total_token_num 120000
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
**注:** 上述命令可以通过 tp 的数量设置,在 tp 张卡上进行 TensorParallel 推理,适用于较大的模型的推理。
|
| 34 |
+
|
| 35 |
+
**注:** 上述命令中的 max_total_token_num,会影响测试过程中的吞吐性能,可以根据 [Lightllm 主页](https://github.com/ModelTC/lightllm) 上的文档,进行设置。只要不爆显存,往往设置越大越好。
|
| 36 |
+
|
| 37 |
+
**注:** 如果要在同一个机器上起多个 Lightllm 服务,需要重新设定上面的 port 和 nccl_port。
|
| 38 |
+
|
| 39 |
+
可以使用下面的 Python 脚本简单测试一下当前服务是否已经起成功
|
| 40 |
+
|
| 41 |
+
```python
|
| 42 |
+
import time
|
| 43 |
+
import requests
|
| 44 |
+
import json
|
| 45 |
+
|
| 46 |
+
url = 'http://localhost:1030/generate'
|
| 47 |
+
headers = {'Content-Type': 'application/json'}
|
| 48 |
+
data = {
|
| 49 |
+
'inputs': 'What is AI?',
|
| 50 |
+
"parameters": {
|
| 51 |
+
'do_sample': False,
|
| 52 |
+
'ignore_eos': False,
|
| 53 |
+
'max_new_tokens': 1024,
|
| 54 |
+
}
|
| 55 |
+
}
|
| 56 |
+
response = requests.post(url, headers=headers, data=json.dumps(data))
|
| 57 |
+
if response.status_code == 200:
|
| 58 |
+
print(response.json())
|
| 59 |
+
else:
|
| 60 |
+
print('Error:', response.status_code, response.text)
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
### 第二步: 使用 OpenCompass 评测上述模型
|
| 64 |
+
|
| 65 |
+
```shell
|
| 66 |
+
python run.py configs/eval_lightllm.py
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
当模型完成推理和指标计算后,我们便可获得模型的评测结果。
|
| 70 |
+
|
| 71 |
+
**注:** `eval_lightllm.py` 中,配置的 url 要和上一步服务地址对齐。
|
opencompass/docs/zh_cn/advanced_guides/evaluation_lmdeploy.md
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 使用 LMDeploy 加速评测
|
| 2 |
+
|
| 3 |
+
我们支持在评测大语言模型时,使用 [LMDeploy](https://github.com/InternLM/lmdeploy) 作为推理加速引擎。LMDeploy 是涵盖了 LLM 和 VLM 任务的全套轻量化、部署和服务解决方案,拥有卓越的推理性能。本教程将介绍如何使用 LMDeploy 加速对模型的评测。
|
| 4 |
+
|
| 5 |
+
## 环境配置
|
| 6 |
+
|
| 7 |
+
### 安装 OpenCompass
|
| 8 |
+
|
| 9 |
+
请根据 OpenCompass [安装指南](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) 来安装算法库和准备数据集。
|
| 10 |
+
|
| 11 |
+
### 安装 LMDeploy
|
| 12 |
+
|
| 13 |
+
使用 pip 安装 LMDeploy (python 3.8+):
|
| 14 |
+
|
| 15 |
+
```shell
|
| 16 |
+
pip install lmdeploy
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
LMDeploy 预编译包默认基于 CUDA 12 编译。如果需要在 CUDA 11+ 下安装 LMDeploy,请执行以下命令:
|
| 20 |
+
|
| 21 |
+
```shell
|
| 22 |
+
export LMDEPLOY_VERSION=0.6.0
|
| 23 |
+
export PYTHON_VERSION=310
|
| 24 |
+
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
## 评测
|
| 28 |
+
|
| 29 |
+
在评测一个模型时,需要准备一份评测配置,指明评测集、模型和推理参数等信息。
|
| 30 |
+
|
| 31 |
+
以 [internlm2-chat-7b](https://huggingface.co/internlm/internlm2-chat-7b) 模型为例,相关的配置信息如下:
|
| 32 |
+
|
| 33 |
+
```python
|
| 34 |
+
# configure the dataset
|
| 35 |
+
from mmengine.config import read_base
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
with read_base():
|
| 39 |
+
# choose a list of datasets
|
| 40 |
+
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
|
| 41 |
+
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
|
| 42 |
+
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
|
| 43 |
+
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \
|
| 44 |
+
gsm8k_datasets
|
| 45 |
+
# and output the results in a chosen format
|
| 46 |
+
from .summarizers.medium import summarizer
|
| 47 |
+
|
| 48 |
+
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
| 49 |
+
|
| 50 |
+
# configure lmdeploy
|
| 51 |
+
from opencompass.models import TurboMindModelwithChatTemplate
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# configure the model
|
| 56 |
+
models = [
|
| 57 |
+
dict(
|
| 58 |
+
type=TurboMindModelwithChatTemplate,
|
| 59 |
+
abbr=f'internlm2-chat-7b-lmdeploy',
|
| 60 |
+
# model path, which can be the address of a model repository on the Hugging Face Hub or a local path
|
| 61 |
+
path='internlm/internlm2-chat-7b',
|
| 62 |
+
# inference backend of LMDeploy. It can be either 'turbomind' or 'pytorch'.
|
| 63 |
+
# If the model is not supported by 'turbomind', it will fallback to
|
| 64 |
+
# 'pytorch'
|
| 65 |
+
backend='turbomind',
|
| 66 |
+
# For the detailed engine config and generation config, please refer to
|
| 67 |
+
# https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/messages.py
|
| 68 |
+
engine_config=dict(tp=1),
|
| 69 |
+
gen_config=dict(do_sample=False),
|
| 70 |
+
# the max size of the context window
|
| 71 |
+
max_seq_len=7168,
|
| 72 |
+
# the max number of new tokens
|
| 73 |
+
max_out_len=1024,
|
| 74 |
+
# the max number of prompts that LMDeploy receives
|
| 75 |
+
# in `generate` function
|
| 76 |
+
batch_size=5000,
|
| 77 |
+
run_cfg=dict(num_gpus=1),
|
| 78 |
+
)
|
| 79 |
+
]
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
把上述配置放在文件中,比如 "configs/eval_internlm2_lmdeploy.py"。然后,在 OpenCompass 的项目目录下,执行如下命令可得到评测结果:
|
| 83 |
+
|
| 84 |
+
```shell
|
| 85 |
+
python run.py configs/eval_internlm2_lmdeploy.py -w outputs
|
| 86 |
+
```
|
opencompass/docs/zh_cn/advanced_guides/longeval.md
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 长文本评测指引
|
| 2 |
+
|
| 3 |
+
## 介绍
|
| 4 |
+
|
| 5 |
+
虽然大语言模型(LLM)如GPT-4在处理自然语言任务已经展现出明显的优势,但目前的开源模型大多只能处理数千个token长度以内的文本,这限制了模型阅读书籍、撰写文本摘要等需要处理长文本的能力。为了探究模型在应对长文本能力时的表现,我们采用[L-Eval](https://github.com/OpenLMLab/LEval)和[LongBench](https://github.com/THUDM/LongBench)两个长文本数据集来测试模型长文本能力。
|
| 6 |
+
|
| 7 |
+
## 现有算法及模型
|
| 8 |
+
|
| 9 |
+
在处理长文本输入时,推理时间开销和灾难性遗忘是大模型面临的两大主要挑战。最近,大量研究致力于扩展模型长度,这些研究集中于以下三个改进方向。
|
| 10 |
+
|
| 11 |
+
- 注意力机制。这些方法的最终目的多为减少query-key对的计算开销,但可能对下游任务的效果产生影响。
|
| 12 |
+
- 输入方法。部分研究将长文本输入分块或将部分已有文本段重复输入模型以增强模型处理长文本能力,但这些方法只对部分任务有效,难以适应多种下游任务。
|
| 13 |
+
- 位置编码。这部分研究包括RoPE, ALiBi,位置插值等,在长度外推方面展现出了良好的效果。这些方法已经被用于训练如ChatGLM2-6b-32k和LongChat-32k等长文本模型。
|
| 14 |
+
|
| 15 |
+
首先,我们介绍一些流行的位置编码算法。
|
| 16 |
+
|
| 17 |
+
### RoPE
|
| 18 |
+
|
| 19 |
+
RoPE是一种在Transformer中注入位置信息的位置嵌入方法。它使用旋转矩阵对绝对位置进行编码,并同时在自注意力公式中融入显式的相对位置依赖关系。下图是RoPE机制的一个示例。
|
| 20 |
+
|
| 21 |
+
<div align="center">
|
| 22 |
+
<img src=https://github.com/open-compass/opencompass/assets/75252858/08c57958-0dcb-40d7-b91b-33f20ca2d89f>
|
| 23 |
+
</div>
|
| 24 |
+
|
| 25 |
+
RoPE具有一些有价值的特性,例如可以扩展到任意序列长度、随着相对距离增加而减弱的token间依赖关系以及为线性自注意力提供相对位置编码的能力。
|
| 26 |
+
|
| 27 |
+
RoPE被应用于许多LLM模型,包括LLaMA、LLaMA 2和Vicuna-7b-v1.5-16k。
|
| 28 |
+
|
| 29 |
+
### ALiBi
|
| 30 |
+
|
| 31 |
+
尽管RoPE和其他替代原始位置编码的方法(如T5 bias)改善了外推能力,但它们的速度比原始方法慢得多,并且使用了额外的内存和参数。因此,作者引入了具有线性偏置的注意力(ALiBi)来促进高效的外推。
|
| 32 |
+
|
| 33 |
+
对于长度为L的输入子序列,注意力子层在每个head中计算第i个query
|
| 34 |
+
|
| 35 |
+
```{math}
|
| 36 |
+
q_{i} \in R^{1 \times d}, (1 \leq i \leq L)
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
的注意力分数,给定前i个键
|
| 40 |
+
|
| 41 |
+
```{math}
|
| 42 |
+
K \in R^{i \times d}
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
其中d是head维度。
|
| 46 |
+
|
| 47 |
+
```{math}
|
| 48 |
+
softmax(q_{i}K^{T})
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
ALiBi通过与相关key和query之间的距离成比例的线性递减惩罚来负向偏置注意力分数。它唯一的修改是在query-key点积之后,在其中添加了一个静态的、非学习的偏置。
|
| 52 |
+
|
| 53 |
+
```{math}
|
| 54 |
+
softmax(q_{i}K^{T}+m\cdot[-(i-1),...,-2,-1,0])
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
其中m是在训练之前固定的head特定的斜率。
|
| 58 |
+
|
| 59 |
+
ALiBi去除了位置嵌入部分,它与原始位置编码方法一样快。它被用于包括mpt-7b-storywriter在内的大语言模型,该模型能够处理非常长的输入。
|
| 60 |
+
|
| 61 |
+
### 位置插值(PI)
|
| 62 |
+
|
| 63 |
+
许多现有的预训练LLM模型包括LLaMA,使用具有弱外推性质(例如RoPE)的位置编码。作者提出了一种位置插值方法,它可以轻松地实现非常长的上下文窗口,同时相对保持模型在其原始上下文窗口大小内的处理质量。
|
| 64 |
+
|
| 65 |
+
位置插值的关键思想是直接缩小位置索引,使得最大位置索引与预训练阶段的先前上下文窗口限制相匹配。换句话说,为了容纳更多的输入token,该算法在相邻的整数位置插值位置编码,利用位置编码可以应用于非整数位置的优势,它不需要在训练位置之外进行外推从而导致灾难性值的出现。该算法只需要很少的微调时间,模型就能完全适应大大扩展的上下文窗口。
|
| 66 |
+
|
| 67 |
+
下图展现了位置插值方法的机制。图中左下方说明了位置插值方法,它将位置索引(蓝色和绿色点)本身从\[0, 4096\]缩小到\[0, 2048\],从而使它们位于预训练范围内。
|
| 68 |
+
|
| 69 |
+
<div align="center">
|
| 70 |
+
<img src=https://github.com/open-compass/opencompass/assets/75252858/406454ba-a811-4c66-abbe-3a5528947257>
|
| 71 |
+
</div>
|
| 72 |
+
|
| 73 |
+
位置插值使得基于ChatGLM2-6B的ChatGLM2-6B-32k模型能够处理32k的上下文窗口大小。
|
| 74 |
+
|
| 75 |
+
接下来,我们将介绍一些我们纳入评测范围的模型。
|
| 76 |
+
|
| 77 |
+
### XGen-7B-8k
|
| 78 |
+
|
| 79 |
+
XGen-7B-8k是使用标准的注意力机制训练的,训练文本最长为8k,总计1.5T个token。为了减少训练时间开销, XGen-7B-8k在不同阶段逐步增加输入文本长度。首先, 模型在序列长度为2k的文本上训练总计800B的token, 随后在长度为4k的文本上训练总计400B的token, 最后, 在长度为8k的文本上训练总计300B的token。
|
| 80 |
+
|
| 81 |
+
### Vicuna-7b-v1.5-16k
|
| 82 |
+
|
| 83 |
+
Vicuna-7b-v1.5-16k是从LLaMA 2微调而来的,它使用了有监督指导微调和线性RoPE扩展方法。训练数据量约为125K个对话,这些对话是从ShareGPT收集而来的。ShareGPT是一个用户可以分享他们与ChatGPT对话的网站。这些对话被打包成每个包含16k个token的序列。
|
| 84 |
+
|
| 85 |
+
### LongChat-7b-v1.5-32k
|
| 86 |
+
|
| 87 |
+
LongChat-7b-v1.5-32k也是从LLaMA 2模型微调得到, LLaMA 2模型最初使用4k的上下文长度进行预训练。LongChat-7b-v1.5-32k的第一步是压缩RoPE。由于LLaMA 2模型在预训练阶段没有训练输入位置大于4096的token,LongChat将位置大于4096的token压缩到0到4096之间。第二步是在对话数据上微调LongChat模型。在这一步中,LongChat使用FastChat中的步骤对数据进行清洗,并将对话文本截断到模型的最大长度。
|
| 88 |
+
|
| 89 |
+
### ChatGLM2-6B-32k
|
| 90 |
+
|
| 91 |
+
ChatGLM2-6B-32k进一步增强了ChatGLM2-6B的长文本能力。它采用位置插值方法,在对话对齐过程中使用32k上下文长度进行训练,因此ChatGLM2-6B-32k能够更好地处理长达32K的上下文长度。
|
| 92 |
+
|
| 93 |
+
## [L-Eval](https://github.com/OpenLMLab/LEval)
|
| 94 |
+
|
| 95 |
+
L-Eval是由OpenLMLab构建的一个长文本数据集,由18个子任务组成,其中包含法律、经济、科技等各个领域的文本。数据集总计411篇文档,超过2000条测例,文档平均长度为7217词。该数据集将子任务划分为close-ended和open-ended两类,5个close-ended任务使用完全匹配(Exact Match)作为评测标准,而13个open-ended任务则使用Rouge分数评测。
|
| 96 |
+
|
| 97 |
+
## [LongBench](https://github.com/THUDM/LongBench)
|
| 98 |
+
|
| 99 |
+
LongBench是由THUDM构建的长文本数据集,由21个子任务构成,总计4750条测例。该数据集是第一个包含中英双语的长文本数据集,其中英语文本长度平均为6711词,中文文本平均长度为13386字。21个子任务分为以下6种类型,对模型各方面能力提供了较为全面的评测。
|
| 100 |
+
|
| 101 |
+
<div align="center">
|
| 102 |
+
<img src=https://github.com/open-compass/opencompass/assets/75252858/4555e937-c519-4e9c-ad8d-7370430d466a>
|
| 103 |
+
</div>
|
| 104 |
+
|
| 105 |
+
## 评测方法
|
| 106 |
+
|
| 107 |
+
由于不同模型能够接受的最大输入长度不同,为了更加公平地比较这些大模型,在输入长度超过模型最大输入限制时,我们将裁剪输入文本的中间部分,从而避免提示词缺失的情况。
|
| 108 |
+
|
| 109 |
+
## 长文本能力榜单
|
| 110 |
+
|
| 111 |
+
在LongBench和L-Eval能力榜单中,我们选取各模型在子任务上排名的平均值 **(排名数值越低越好)** 作为标准。可以看到GPT-4和GPT-3.5-turbo-16k在长文本任务中仍然占据领先地位,而例如ChatGLM2-6B-32k在基于ChatGLM2-6B使用位置插值后在长文本能力方面也有明显提升。
|
| 112 |
+
|
| 113 |
+
<div align="center">
|
| 114 |
+
<img src=https://github.com/open-compass/opencompass/assets/75252858/29b5ad12-d9a3-4255-be0a-f770923fe514>
|
| 115 |
+
<img src=https://github.com/open-compass/opencompass/assets/75252858/680b4cda-c2b1-45d1-8c33-196dee1a38f3>
|
| 116 |
+
</div>
|
| 117 |
+
|
| 118 |
+
原始分数如下所示。
|
| 119 |
+
|
| 120 |
+
| L-Eval | GPT-4 | GPT-3.5-turbo-16k | chatglm2-6b-32k | vicuna-7b-v1.5-16k | xgen-7b-8k | internlm-chat-7b-8k | longchat-7b-v1.5-32k | chatglm2-6b |
|
| 121 |
+
| ----------------- | ----- | ----------------- | --------------- | ------------------ | ---------- | ------------------- | -------------------- | ----------- |
|
| 122 |
+
| coursera | 61.05 | 50 | 45.35 | 26.74 | 33.72 | 40.12 | 27.91 | 38.95 |
|
| 123 |
+
| gsm100 | 92 | 78 | 27 | 11 | 8 | 19 | 5 | 8 |
|
| 124 |
+
| quality | 81.19 | 62.87 | 44.55 | 11.39 | 33.66 | 45.54 | 29.7 | 41.09 |
|
| 125 |
+
| tpo | 72.93 | 74.72 | 56.51 | 17.47 | 44.61 | 60.59 | 17.1 | 56.51 |
|
| 126 |
+
| topic_retrieval | 100 | 79.33 | 44.67 | 24.67 | 1.33 | 0 | 25.33 | 1.33 |
|
| 127 |
+
| | | | | | | | | |
|
| 128 |
+
| financialqa | 53.49 | 50.32 | 35.41 | 44.59 | 39.28 | 25.09 | 34.07 | 17.82 |
|
| 129 |
+
| gov_report | 50.84 | 50.48 | 42.97 | 48.17 | 38.52 | 31.29 | 36.52 | 41.88 |
|
| 130 |
+
| legal_contract_qa | 31.23 | 27.97 | 34.21 | 24.25 | 21.36 | 19.28 | 13.32 | 17.59 |
|
| 131 |
+
| meeting_summ | 31.44 | 33.54 | 29.13 | 28.52 | 27.96 | 17.56 | 22.32 | 15.98 |
|
| 132 |
+
| multidocqa | 37.81 | 35.84 | 28.6 | 26.88 | 24.41 | 22.43 | 21.85 | 19.66 |
|
| 133 |
+
| narrativeqa | 25.87 | 25.73 | 18.24 | 20.58 | 16.87 | 13.81 | 16.87 | 1.16 |
|
| 134 |
+
| nq | 67.36 | 66.91 | 41.06 | 36.44 | 29.43 | 16.42 | 35.02 | 0.92 |
|
| 135 |
+
| news_summ | 34.52 | 40.41 | 32.72 | 33.98 | 26.87 | 22.48 | 30.33 | 29.51 |
|
| 136 |
+
| paper_assistant | 42.26 | 41.76 | 34.59 | 35.83 | 25.39 | 28.25 | 30.42 | 30.43 |
|
| 137 |
+
| patent_summ | 48.61 | 50.62 | 46.04 | 48.87 | 46.53 | 30.3 | 41.6 | 41.25 |
|
| 138 |
+
| review_summ | 31.98 | 33.37 | 21.88 | 29.21 | 26.85 | 16.61 | 20.02 | 19.68 |
|
| 139 |
+
| scientificqa | 49.76 | 48.32 | 31.27 | 31 | 27.43 | 33.01 | 20.98 | 13.61 |
|
| 140 |
+
| tvshow_summ | 34.84 | 31.36 | 23.97 | 27.88 | 26.6 | 14.55 | 25.09 | 19.45 |
|
| 141 |
+
|
| 142 |
+
| LongBench | GPT-4 | GPT-3.5-turbo-16k | chatglm2-6b-32k | longchat-7b-v1.5-32k | vicuna-7b-v1.5-16k | internlm-chat-7b-8k | chatglm2-6b | xgen-7b-8k |
|
| 143 |
+
| ------------------- | ----- | ----------------- | --------------- | -------------------- | ------------------ | ------------------- | ----------- | ---------- |
|
| 144 |
+
| NarrativeQA | 31.2 | 25.79 | 19.27 | 19.19 | 23.65 | 12.24 | 13.09 | 18.85 |
|
| 145 |
+
| Qasper | 42.77 | 43.4 | 33.93 | 30.36 | 31.45 | 24.81 | 22.52 | 20.18 |
|
| 146 |
+
| MultiFieldQA-en | 55.1 | 54.35 | 45.58 | 44.6 | 43.38 | 25.41 | 38.09 | 37 |
|
| 147 |
+
| MultiFieldQA-zh | 64.4 | 61.92 | 52.94 | 32.35 | 44.65 | 36.13 | 37.67 | 14.7 |
|
| 148 |
+
| | | | | | | | | |
|
| 149 |
+
| HotpotQA | 59.85 | 52.49 | 46.41 | 34.43 | 34.17 | 27.42 | 27.35 | 28.78 |
|
| 150 |
+
| 2WikiMQA | 67.52 | 41.7 | 33.63 | 23.06 | 20.45 | 26.24 | 22.83 | 20.13 |
|
| 151 |
+
| Musique | 37.53 | 27.5 | 21.57 | 12.42 | 13.92 | 9.75 | 7.26 | 11.34 |
|
| 152 |
+
| DuReader (zh) | 38.65 | 29.37 | 38.53 | 20.25 | 20.42 | 11.11 | 17.18 | 8.57 |
|
| 153 |
+
| | | | | | | | | |
|
| 154 |
+
| GovReport | 32.09 | 29.92 | 32.47 | 29.83 | 29.27 | 18.38 | 22.86 | 23.37 |
|
| 155 |
+
| QMSum | 24.37 | 23.67 | 23.19 | 22.71 | 23.37 | 18.45 | 21.23 | 21.12 |
|
| 156 |
+
| Multi_news | 28.52 | 27.05 | 25.12 | 26.1 | 27.83 | 24.52 | 24.7 | 23.69 |
|
| 157 |
+
| VCSUM (zh) | 15.54 | 16.88 | 15.95 | 13.46 | 15.76 | 12.91 | 14.07 | 0.98 |
|
| 158 |
+
| | | | | | | | | |
|
| 159 |
+
| TREC | 78.5 | 73.5 | 30.96 | 29.23 | 32.06 | 39 | 24.46 | 29.31 |
|
| 160 |
+
| TriviaQA | 92.19 | 92.75 | 80.64 | 64.19 | 46.53 | 79.55 | 64.19 | 69.58 |
|
| 161 |
+
| SAMSum | 46.32 | 43.16 | 29.49 | 25.23 | 25.23 | 43.05 | 20.22 | 16.05 |
|
| 162 |
+
| LSHT (zh) | 41.5 | 34.5 | 22.75 | 20 | 24.75 | 20.5 | 16 | 18.67 |
|
| 163 |
+
| | | | | | | | | |
|
| 164 |
+
| Passage Count | 8.5 | 3 | 3 | 1 | 3 | 1.76 | 3 | 1 |
|
| 165 |
+
| PassageRetrieval-en | 75 | 73 | 57.5 | 20.5 | 16.5 | 7 | 5.5 | 12 |
|
| 166 |
+
| PassageRetrieval-zh | 96 | 82.5 | 58 | 15 | 21 | 2.29 | 5 | 3.75 |
|
| 167 |
+
| | | | | | | | | |
|
| 168 |
+
| LCC | 59.25 | 53.49 | 53.3 | 51.46 | 49.3 | 49.32 | 46.59 | 44.1 |
|
| 169 |
+
| RepoBench-P | 55.42 | 55.95 | 46.66 | 52.18 | 41.49 | 35.86 | 41.97 | 41.83 |
|
opencompass/docs/zh_cn/advanced_guides/needleinahaystack_eval.md
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 大海捞针(Needle In A Haystack)实验评估
|
| 2 |
+
|
| 3 |
+
## 大海捞针测试简介
|
| 4 |
+
|
| 5 |
+
大海捞针测试(灵感来自[NeedleInAHaystack](https://github.com/gkamradt/LLMTest_NeedleInAHaystack/blob/main/LLMNeedleHaystackTester.py))是一种评估方法,它通过在长文本中随机插入关键信息,形成大型语言模型(LLM)的Prompt。该测试旨在检测大型模型是否能从长文本中提取出这些关键信息,从而评估模型处理长文本信息提取的能力,这可以反映LLM对长文本的理解基础能力。
|
| 6 |
+
|
| 7 |
+
## 任务介绍
|
| 8 |
+
|
| 9 |
+
在`OpenCompass`的`NeedleBench`框架中,为了全面评估模型在长文本信息提取和推理方面的能力,我们设计了一系列逐渐增加难度的测试方案。完整的介绍参见我们的[技术报告](https://arxiv.org/abs/2407.11963)。
|
| 10 |
+
|
| 11 |
+
- **单一信息检索任务(Single-Needle Retrieval Task, S-RT)**:评估LLM在长文本中提取单一关键信息的能力,测试其对广泛叙述中特定细节的精确回忆能力。这对应于**原始的大海捞针测试**任务设定。
|
| 12 |
+
|
| 13 |
+
- **多信息检索任务(Multi-Needle Retrieval Task, M-RT)**:探讨LLM从长文本中检索多个相关信息的能力,模拟实际场景中对综合文档的复杂查询。
|
| 14 |
+
|
| 15 |
+
- **多信息推理任务(Multi-Needle Reasoning Task, M-RS)**:通过提取并利用长文本中的多个关键信息来评估LLM的长文本能力,要求模型对各关键信息片段有综合理解。
|
| 16 |
+
|
| 17 |
+
- **祖先追溯挑战(Ancestral Trace Challenge, ATC)**:通过设计“亲属关系针”,测试LLM处理真实长文本中多层逻辑挑战的能力。在ATC任务中,通过一系列逻辑推理问题,检验模型对长文本中每个细节的记忆和分析能力,在此任务中,我们去掉了无关文本(Haystack)的设定,而是将所有文本设计为关键信息,LLM必须综合运用长文本中的所有内容和推理才能准确回答问题。
|
| 18 |
+
|
| 19 |
+
### 评估步骤
|
| 20 |
+
|
| 21 |
+
> 注意:在最新代码中,OpenCompass已经设置数据集从[Huggingface的接口](https://huggingface.co/datasets/opencompass/NeedleBench)中自动加载,可以直接跳过下面的手动下载安放数据集。
|
| 22 |
+
|
| 23 |
+
1. 从[这里](https://github.com/open-compass/opencompass/files/14741330/needlebench.zip)下载数据集。
|
| 24 |
+
|
| 25 |
+
2. 将下载的文件放置于`opencompass/data/needlebench/`目录下。`needlebench`目录中预期的文件结构如下所示:
|
| 26 |
+
|
| 27 |
+
```
|
| 28 |
+
opencompass/
|
| 29 |
+
├── configs
|
| 30 |
+
├── docs
|
| 31 |
+
├── data
|
| 32 |
+
│ └── needlebench
|
| 33 |
+
│ ├── multi_needle_reasoning_en.json
|
| 34 |
+
│ ├── multi_needle_reasoning_zh.json
|
| 35 |
+
│ ├── names.json
|
| 36 |
+
│ ├── needles.jsonl
|
| 37 |
+
│ ├── PaulGrahamEssays.jsonl
|
| 38 |
+
│ ├── zh_finance.jsonl
|
| 39 |
+
│ ├── zh_game.jsonl
|
| 40 |
+
│ ├── zh_government.jsonl
|
| 41 |
+
│ ├── zh_movie.jsonl
|
| 42 |
+
│ ├── zh_tech.jsonl
|
| 43 |
+
│ ├── zh_general.jsonl
|
| 44 |
+
├── LICENSE
|
| 45 |
+
├── opencompass
|
| 46 |
+
├── outputs
|
| 47 |
+
├── run.py
|
| 48 |
+
├── more...
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
### `OpenCompass`环境配置
|
| 52 |
+
|
| 53 |
+
```bash
|
| 54 |
+
conda create --name opencompass python=3.10 pytorch torchvision pytorch-cuda -c nvidia -c pytorch -y
|
| 55 |
+
conda activate opencompass
|
| 56 |
+
git clone https://github.com/open-compass/opencompass opencompass
|
| 57 |
+
cd opencompass
|
| 58 |
+
pip install -e .
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
### 配置数据集
|
| 62 |
+
|
| 63 |
+
我们在`configs/datasets/needlebench`中已经预先配置好了关于常见长度区间(4k, 8k, 32k, 128k, 200k, 1000k)的长文本测试设定,您可以通过在配置文件中定义相关参数,以灵活地创建适合您需求的数据集。
|
| 64 |
+
|
| 65 |
+
### 评估示例
|
| 66 |
+
|
| 67 |
+
#### 使用`LMDeploy`部署的 `InternLM2-7B` 模型进行评估
|
| 68 |
+
|
| 69 |
+
例如,使用`LMDeploy`部署的 `InternLM2-7B` 模型评估NeedleBench-4K的所有任务,可以在命令行中直接使用以下命令,该命令会调用预定义好的模型、数据集配置文件,而无需额外书写配置文件:
|
| 70 |
+
|
| 71 |
+
##### 本地评估
|
| 72 |
+
|
| 73 |
+
如果您在本地评估模型,下面命令会调用机器的所有可用GPU。您可以通过设置 `CUDA_VISIBLE_DEVICES` 环境变量来限制 `OpenCompass` 的 GPU 访问。例如,使用 `CUDA_VISIBLE_DEVICES=0,1,2,3 python run.py ...` 只会向 OpenCompass 暴露前四个 GPU,确保它同时使用的 GPU 数量不超过这四个。
|
| 74 |
+
|
| 75 |
+
```bash
|
| 76 |
+
# 本地评估
|
| 77 |
+
python run.py --dataset needlebench_4k --models lmdeploy_internlm2_chat_7b --summarizer needlebench/needlebench_4k_summarizer
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
##### 在Slurm集群上评估
|
| 81 |
+
|
| 82 |
+
如果使用 `Slurm`,可以添加 `--slurm -p partition_name -q reserved --max-num-workers 16` 等参数,例如下面:
|
| 83 |
+
|
| 84 |
+
```bash
|
| 85 |
+
# Slurm评估
|
| 86 |
+
python run.py --dataset needlebench_4k --models lmdeploy_internlm2_chat_7b --summarizer needlebench/needlebench_4k_summarizer --slurm -p partition_name -q reserved --max-num-workers 16
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
##### 只评估子数据集
|
| 90 |
+
|
| 91 |
+
如果只想测试原始的大海捞针任务设定,比如可以更换数据集的参数为`needlebench_single_4k`,这对应于4k长度下的单针版本的大海捞针测试:
|
| 92 |
+
|
| 93 |
+
```bash
|
| 94 |
+
python run.py --dataset needlebench_single_4k --models lmdeploy_internlm2_chat_7b --summarizer needlebench/needlebench_4k_summarizer --slurm -p partition_name -q reserved --max-num-workers 16
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
您也可以进一步选择子数据集,如更换数据集`--datasets`的参数为`needlebench_single_4k/needlebench_zh_datasets`,仅仅进行中文版本的单针4K长度下的大海捞针任务测试,其中`/`后面的参数代表子数据集,您可以在`configs/datasets/needlebench/needlebench_4k/needlebench_single_4k.py`中找到可选的子数据集变量,如:
|
| 98 |
+
|
| 99 |
+
```bash
|
| 100 |
+
python run.py --dataset needlebench_single_4k/needlebench_zh_datasets --models lmdeploy_internlm2_chat_7b --summarizer needlebench/needlebench_4k_summarizer --slurm -p partition_name -q reserved --max-num-workers 16
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
注意在评估前预先安装[LMDeploy](https://github.com/InternLM/lmdeploy)工具
|
| 104 |
+
|
| 105 |
+
```bash
|
| 106 |
+
pip install lmdeploy
|
| 107 |
+
```
|
| 108 |
+
|
| 109 |
+
这个命令将启动评估流程,参数 `-p partition_name -q auto` 和 `--max-num-workers 32` 用于指定 Slurm 分区名称和最大工作进程数。
|
| 110 |
+
|
| 111 |
+
#### 评估其他`Huggingface`模型
|
| 112 |
+
|
| 113 |
+
对于其他模型,我们建议额外书写一个运行的配置文件以便对模型的`max_seq_len`, `max_out_len`参数进行修改,以便模型可以接收到完整的长文本内容。如我们预先写好的`configs/eval_needlebench.py`文件。完整内容如下
|
| 114 |
+
|
| 115 |
+
```python
|
| 116 |
+
from mmengine.config import read_base
|
| 117 |
+
# 我们使用mmengine.config来import其他的配置文件中的变量
|
| 118 |
+
|
| 119 |
+
with read_base():
|
| 120 |
+
# from .models.hf_internlm.lmdeploy_internlm2_chat_7b import models as internlm2_chat_7b_200k
|
| 121 |
+
from .models.hf_internlm.hf_internlm2_chat_7b import models as internlm2_chat_7b
|
| 122 |
+
|
| 123 |
+
# Evaluate needlebench_4k, adjust the configuration to use 8k, 32k, 128k, 200k, or 1000k if necessary.
|
| 124 |
+
# from .datasets.needlebench.needlebench_4k.needlebench_4k import needlebench_datasets
|
| 125 |
+
# from .summarizers.needlebench import needlebench_4k_summarizer as summarizer
|
| 126 |
+
|
| 127 |
+
# only eval original "needle in a haystack test" in needlebench_4k
|
| 128 |
+
from .datasets.needlebench.needlebench_4k.needlebench_single_4k import needlebench_zh_datasets, needlebench_en_datasets
|
| 129 |
+
from .summarizers.needlebench import needlebench_4k_summarizer as summarizer
|
| 130 |
+
|
| 131 |
+
# eval Ancestral Tracing Challenge(ATC)
|
| 132 |
+
# from .datasets.needlebench.atc.atc_choice_50 import needlebench_datasets
|
| 133 |
+
# from .summarizers.needlebench import atc_summarizer_50 as summarizer
|
| 134 |
+
|
| 135 |
+
datasets = sum([v for k, v in locals().items() if ('datasets' in k)], [])
|
| 136 |
+
|
| 137 |
+
for m in internlm2_chat_7b:
|
| 138 |
+
m['max_seq_len'] = 30768 # 保证InternLM2-7B模型能接收到完整的长文本,其他模型需要根据各自支持的最大序列长度修改。
|
| 139 |
+
m['max_out_len'] = 2000 # 保证在多针召回任务中能接收到模型完整的回答
|
| 140 |
+
|
| 141 |
+
models = internlm2_chat_7b
|
| 142 |
+
|
| 143 |
+
work_dir = './outputs/needlebench'
|
| 144 |
+
```
|
| 145 |
+
|
| 146 |
+
当书写好测试的`config`文件后,我们可以命令行中通过`run.py`文件传入对应的config文件路径,例如:
|
| 147 |
+
|
| 148 |
+
```bash
|
| 149 |
+
python run.py configs/eval_needlebench.py --slurm -p partition_name -q reserved --max-num-workers 16
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
注意,此时我们不需传入`--dataset, --models, --summarizer`等参数,因为我们已经在config文件中定义了这些配置。你可以自己手动调节`--max-num-workers`的设定以调节并行工作的workers的数量。
|
| 153 |
+
|
| 154 |
+
### 可视化
|
| 155 |
+
|
| 156 |
+
我们已经在最新的代码中将结果可视化内置到`summarizer`实现中,您在对应的output文件夹的plots目录下可以看到相应的可视化。而不需要自己手动可视化各个深度和长度下的分数。
|
| 157 |
+
|
| 158 |
+
如果使用了该方法,请添加引用:
|
| 159 |
+
|
| 160 |
+
```bibtex
|
| 161 |
+
|
| 162 |
+
@misc{li2024needlebenchllmsretrievalreasoning,
|
| 163 |
+
title={NeedleBench: Can LLMs Do Retrieval and Reasoning in 1 Million Context Window?},
|
| 164 |
+
author={Mo Li and Songyang Zhang and Yunxin Liu and Kai Chen},
|
| 165 |
+
year={2024},
|
| 166 |
+
eprint={2407.11963},
|
| 167 |
+
archivePrefix={arXiv},
|
| 168 |
+
primaryClass={cs.CL},
|
| 169 |
+
url={https://arxiv.org/abs/2407.11963},
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
@misc{2023opencompass,
|
| 173 |
+
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
|
| 174 |
+
author={OpenCompass Contributors},
|
| 175 |
+
howpublished={\url{https://github.com/open-compass/opencompass}},
|
| 176 |
+
year={2023}
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
@misc{LLMTest_NeedleInAHaystack,
|
| 180 |
+
title={LLMTest Needle In A Haystack - Pressure Testing LLMs},
|
| 181 |
+
author={gkamradt},
|
| 182 |
+
year={2023},
|
| 183 |
+
howpublished={\url{https://github.com/gkamradt/LLMTest_NeedleInAHaystack}}
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
@misc{wei2023skywork,
|
| 187 |
+
title={Skywork: A More Open Bilingual Foundation Model},
|
| 188 |
+
author={Tianwen Wei and Liang Zhao and Lichang Zhang and Bo Zhu and Lijie Wang and Haihua Yang and Biye Li and Cheng Cheng and Weiwei Lü and Rui Hu and Chenxia Li and Liu Yang and Xilin Luo and Xuejie Wu and Lunan Liu and Wenjun Cheng and Peng Cheng and Jianhao Zhang and Xiaoyu Zhang and Lei Lin and Xiaokun Wang and Yutuan Ma and Chuanhai Dong and Yanqi Sun and Yifu Chen and Yongyi Peng and Xiaojuan Liang and Shuicheng Yan and Han Fang and Yahui Zhou},
|
| 189 |
+
year={2023},
|
| 190 |
+
eprint={2310.19341},
|
| 191 |
+
archivePrefix={arXiv},
|
| 192 |
+
primaryClass={cs.CL}
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
```
|
opencompass/docs/zh_cn/advanced_guides/new_dataset.md
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 支持新数据集
|
| 2 |
+
|
| 3 |
+
尽管 OpenCompass 已经包含了大多数常用数据集,用户在支持新数据集的时候需要完成以下几个步骤:
|
| 4 |
+
|
| 5 |
+
1. 在 `opencompass/datasets` 文件夹新增数据集脚本 `mydataset.py`, 该脚本需要包含:
|
| 6 |
+
|
| 7 |
+
- 数据集及其加载方式,需要定义一个 `MyDataset` 类,实现数据集加载方法 `load`,该方法为静态方法,需要返回 `datasets.Dataset` 类型的数据。这里我们使用 huggingface dataset 作为数据集的统一接口,避免引入额外的逻辑。具体示例如下:
|
| 8 |
+
|
| 9 |
+
```python
|
| 10 |
+
import datasets
|
| 11 |
+
from .base import BaseDataset
|
| 12 |
+
|
| 13 |
+
class MyDataset(BaseDataset):
|
| 14 |
+
|
| 15 |
+
@staticmethod
|
| 16 |
+
def load(**kwargs) -> datasets.Dataset:
|
| 17 |
+
pass
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
- (可选)如果 OpenCompass 已有的评测器不能满足需要,需要用户定义 `MyDatasetlEvaluator` 类,实现评分方法 `score`,需要根据输入的 `predictions` 和 `references` 列表,得到需要的字典。由于一个数据集可能存在多种 metric,需要返回一个 metrics 以及对应 scores 的相关字典。具体示例如下:
|
| 21 |
+
|
| 22 |
+
```python
|
| 23 |
+
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
| 24 |
+
|
| 25 |
+
class MyDatasetlEvaluator(BaseEvaluator):
|
| 26 |
+
|
| 27 |
+
def score(self, predictions: List, references: List) -> dict:
|
| 28 |
+
pass
|
| 29 |
+
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
- (可选)如果 OpenCompass 已有的后处理方法不能满足需要,需要用户定义 `mydataset_postprocess` 方法,根据输入的字符串得到相应后处理的结果。具体示例如下:
|
| 33 |
+
|
| 34 |
+
```python
|
| 35 |
+
def mydataset_postprocess(text: str) -> str:
|
| 36 |
+
pass
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
2. 在定义好数据集加载、评测以及数据后处理等方法之后,需要在配置文件中新增以下配置:
|
| 40 |
+
|
| 41 |
+
```python
|
| 42 |
+
from opencompass.datasets import MyDataset, MyDatasetlEvaluator, mydataset_postprocess
|
| 43 |
+
|
| 44 |
+
mydataset_eval_cfg = dict(
|
| 45 |
+
evaluator=dict(type=MyDatasetlEvaluator),
|
| 46 |
+
pred_postprocessor=dict(type=mydataset_postprocess))
|
| 47 |
+
|
| 48 |
+
mydataset_datasets = [
|
| 49 |
+
dict(
|
| 50 |
+
type=MyDataset,
|
| 51 |
+
...,
|
| 52 |
+
reader_cfg=...,
|
| 53 |
+
infer_cfg=...,
|
| 54 |
+
eval_cfg=mydataset_eval_cfg)
|
| 55 |
+
]
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
详细的数据集配置文件以及其他需要的配置文件可以参考[配置文件](../user_guides/config.md)教程,启动任务相关的教程可以参考[快速开始](../get_started/quick_start.md)教程。
|
opencompass/docs/zh_cn/advanced_guides/new_model.md
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 支持新模型
|
| 2 |
+
|
| 3 |
+
目前我们已经支持的模型有 HF 模型、部分模型 API 、部分第三方模型。
|
| 4 |
+
|
| 5 |
+
## 新增API模型
|
| 6 |
+
|
| 7 |
+
新增基于API的模型,需要在 `opencompass/models` 下新建 `mymodel_api.py` 文件,继承 `BaseAPIModel`,并实现 `generate` 方法来进行推理,以及 `get_token_len` 方法来计算 token 的长度。在定义好之后修改对应配置文件名称即可。
|
| 8 |
+
|
| 9 |
+
```python
|
| 10 |
+
from ..base_api import BaseAPIModel
|
| 11 |
+
|
| 12 |
+
class MyModelAPI(BaseAPIModel):
|
| 13 |
+
|
| 14 |
+
is_api: bool = True
|
| 15 |
+
|
| 16 |
+
def __init__(self,
|
| 17 |
+
path: str,
|
| 18 |
+
max_seq_len: int = 2048,
|
| 19 |
+
query_per_second: int = 1,
|
| 20 |
+
retry: int = 2,
|
| 21 |
+
**kwargs):
|
| 22 |
+
super().__init__(path=path,
|
| 23 |
+
max_seq_len=max_seq_len,
|
| 24 |
+
meta_template=meta_template,
|
| 25 |
+
query_per_second=query_per_second,
|
| 26 |
+
retry=retry)
|
| 27 |
+
...
|
| 28 |
+
|
| 29 |
+
def generate(
|
| 30 |
+
self,
|
| 31 |
+
inputs,
|
| 32 |
+
max_out_len: int = 512,
|
| 33 |
+
temperature: float = 0.7,
|
| 34 |
+
) -> List[str]:
|
| 35 |
+
"""Generate results given a list of inputs."""
|
| 36 |
+
pass
|
| 37 |
+
|
| 38 |
+
def get_token_len(self, prompt: str) -> int:
|
| 39 |
+
"""Get lengths of the tokenized string."""
|
| 40 |
+
pass
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
## 新增第三方模型
|
| 44 |
+
|
| 45 |
+
新增基于第三方的模型,需要在 `opencompass/models` 下新建 `mymodel.py` 文件,继承 `BaseModel`,并实现 `generate` 方法来进行生成式推理, `get_ppl` 方法来进行判别式推理,以及 `get_token_len` 方法来计算 token 的长度。在定义好之后修改对应配置文件名称即可。
|
| 46 |
+
|
| 47 |
+
```python
|
| 48 |
+
from ..base import BaseModel
|
| 49 |
+
|
| 50 |
+
class MyModel(BaseModel):
|
| 51 |
+
|
| 52 |
+
def __init__(self,
|
| 53 |
+
pkg_root: str,
|
| 54 |
+
ckpt_path: str,
|
| 55 |
+
tokenizer_only: bool = False,
|
| 56 |
+
meta_template: Optional[Dict] = None,
|
| 57 |
+
**kwargs):
|
| 58 |
+
...
|
| 59 |
+
|
| 60 |
+
def get_token_len(self, prompt: str) -> int:
|
| 61 |
+
"""Get lengths of the tokenized strings."""
|
| 62 |
+
pass
|
| 63 |
+
|
| 64 |
+
def generate(self, inputs: List[str], max_out_len: int) -> List[str]:
|
| 65 |
+
"""Generate results given a list of inputs. """
|
| 66 |
+
pass
|
| 67 |
+
|
| 68 |
+
def get_ppl(self,
|
| 69 |
+
inputs: List[str],
|
| 70 |
+
mask_length: Optional[List[int]] = None) -> List[float]:
|
| 71 |
+
"""Get perplexity scores given a list of inputs."""
|
| 72 |
+
pass
|
| 73 |
+
```
|
opencompass/docs/zh_cn/advanced_guides/objective_judgelm_evaluation.md
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 用大模型作为JudgeLLM进行客观评测
|
| 2 |
+
|
| 3 |
+
## 介绍
|
| 4 |
+
|
| 5 |
+
通常的客观评测虽有标准答案作为参考,但是在实际应用中,模型预测结果可能因为模型指令遵循能力不同或后处理函数的不完善而产生差异,导致无法抽取到正确的答案并与标准答案进行对比。因此客观评测的结果可能并不完全准确。为了解决这一问题,我们参照主观评测,在预测完成后引入了JudgeLLM作为评价模型,以评估模型回答和标准答案的一致性([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685))。
|
| 6 |
+
|
| 7 |
+
目前opencompass仓库里支持的所有模型都可以直接作为JudgeLLM进行调用,此外一些专用的JudgeLLM我们也在计划支持中。
|
| 8 |
+
|
| 9 |
+
## 目前已支持的用JudgeLLM进行直接评测的客观评测数据集
|
| 10 |
+
|
| 11 |
+
1. [MATH](https://github.com/hendrycks/math)
|
| 12 |
+
|
| 13 |
+
## 自定义JudgeLLM客观数据集评测
|
| 14 |
+
|
| 15 |
+
目前的OpenCompass支持大部分采用`GenInferencer`的数据集进行推理。自定义JudgeLLM客观评测的具体流程包括:
|
| 16 |
+
|
| 17 |
+
1. 构建评测配置,使用API模型或者开源模型进行问题答案的推理
|
| 18 |
+
2. 使用选定的评价模型(JudgeLLM)对模型输出进行评估
|
| 19 |
+
|
| 20 |
+
### 第一步:构建评测配置,以MATH为例
|
| 21 |
+
|
| 22 |
+
下面是对MATH数据集进行JudgeLLM评测的Config,评测模型为*Llama3-8b-instruct*,JudgeLLM为*Llama3-70b-instruct*。更详细的config setting请参考 `configs/eval_math_llm_judge.py`,下面我们提供了部分简略版的注释,方便用户理解配置文件的含义。
|
| 23 |
+
|
| 24 |
+
```python
|
| 25 |
+
# Most of the code in this file is copied from https://github.com/openai/simple-evals/blob/main/math_eval.py
|
| 26 |
+
from mmengine.config import read_base
|
| 27 |
+
with read_base():
|
| 28 |
+
from .models.hf_llama.hf_llama3_8b_instruct import models as hf_llama3_8b_instruct_model # noqa: F401, F403
|
| 29 |
+
from .models.hf_llama.hf_llama3_70b_instruct import models as hf_llama3_70b_instruct_model # noqa: F401, F403
|
| 30 |
+
from .datasets.math.math_llm_judge import math_datasets # noqa: F401, F403
|
| 31 |
+
from opencompass.datasets import math_judement_preprocess
|
| 32 |
+
from opencompass.partitioners import NaivePartitioner, SizePartitioner
|
| 33 |
+
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
| 34 |
+
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
| 35 |
+
from opencompass.runners import LocalRunner
|
| 36 |
+
from opencompass.runners import SlurmSequentialRunner
|
| 37 |
+
from opencompass.tasks import OpenICLInferTask
|
| 38 |
+
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
| 39 |
+
from opencompass.summarizers import AllObjSummarizer
|
| 40 |
+
from opencompass.openicl.icl_evaluator import LMEvaluator
|
| 41 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# ------------- Prompt设置 ----------------------------------------
|
| 45 |
+
# 评测模板,请根据需要修改模板,JudgeLLM默认采用[Yes]或[No]作为回答,在MATH数据集中,评测模板如下
|
| 46 |
+
eng_obj_prompt = """
|
| 47 |
+
Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications
|
| 48 |
+
|
| 49 |
+
Examples:
|
| 50 |
+
|
| 51 |
+
Expression 1: $2x+3$
|
| 52 |
+
Expression 2: $3+2x$
|
| 53 |
+
|
| 54 |
+
[Yes]
|
| 55 |
+
|
| 56 |
+
Expression 1: 3/2
|
| 57 |
+
Expression 2: 1.5
|
| 58 |
+
|
| 59 |
+
[Yes]
|
| 60 |
+
|
| 61 |
+
Expression 1: $x^2+2x+1$
|
| 62 |
+
Expression 2: $y^2+2y+1$
|
| 63 |
+
|
| 64 |
+
[No]
|
| 65 |
+
|
| 66 |
+
Expression 1: $x^2+2x+1$
|
| 67 |
+
Expression 2: $(x+1)^2$
|
| 68 |
+
|
| 69 |
+
[Yes]
|
| 70 |
+
|
| 71 |
+
Expression 1: 3245/5
|
| 72 |
+
Expression 2: 649
|
| 73 |
+
|
| 74 |
+
[No]
|
| 75 |
+
(these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications)
|
| 76 |
+
|
| 77 |
+
Expression 1: 2/(-3)
|
| 78 |
+
Expression 2: -2/3
|
| 79 |
+
|
| 80 |
+
[Yes]
|
| 81 |
+
(trivial simplifications are allowed)
|
| 82 |
+
|
| 83 |
+
Expression 1: 72 degrees
|
| 84 |
+
Expression 2: 72
|
| 85 |
+
|
| 86 |
+
[Yes]
|
| 87 |
+
(give benefit of the doubt to units)
|
| 88 |
+
|
| 89 |
+
Expression 1: 64
|
| 90 |
+
Expression 2: 64 square feet
|
| 91 |
+
|
| 92 |
+
[Yes]
|
| 93 |
+
(give benefit of the doubt to units)
|
| 94 |
+
|
| 95 |
+
Expression 1: 64
|
| 96 |
+
Expression 2:
|
| 97 |
+
|
| 98 |
+
[No]
|
| 99 |
+
(only mark as equivalent if both expressions are nonempty)
|
| 100 |
+
|
| 101 |
+
---
|
| 102 |
+
|
| 103 |
+
YOUR TASK
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
Respond with only "[Yes]" or "[No]" (without quotes). Do not include a rationale.
|
| 107 |
+
Expression 1: {obj_gold}
|
| 108 |
+
Expression 2: {prediction}
|
| 109 |
+
|
| 110 |
+
"""
|
| 111 |
+
|
| 112 |
+
# -------------推理阶段 ----------------------------------------
|
| 113 |
+
# 需要评测的模型
|
| 114 |
+
models = [*hf_llama3_8b_instruct_model]
|
| 115 |
+
# 评价模型
|
| 116 |
+
judge_models = hf_llama3_70b_instruct_model
|
| 117 |
+
|
| 118 |
+
eng_datasets = [*math_datasets]
|
| 119 |
+
chn_datasets = []
|
| 120 |
+
datasets = eng_datasets + chn_datasets
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
for d in eng_datasets:
|
| 124 |
+
d['eval_cfg']= dict(
|
| 125 |
+
evaluator=dict(
|
| 126 |
+
type=LMEvaluator,
|
| 127 |
+
# 如果你需要在判断之前预处理模型预测,
|
| 128 |
+
# 你可以在这里指定pred_postprocessor函数
|
| 129 |
+
pred_postprocessor=dict(type=math_judement_preprocess),
|
| 130 |
+
prompt_template=dict(
|
| 131 |
+
type=PromptTemplate,
|
| 132 |
+
template=dict(round=[
|
| 133 |
+
dict(
|
| 134 |
+
role='HUMAN',
|
| 135 |
+
prompt = eng_obj_prompt
|
| 136 |
+
),
|
| 137 |
+
]),
|
| 138 |
+
),
|
| 139 |
+
),
|
| 140 |
+
pred_role="BOT",
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
infer = dict(
|
| 144 |
+
partitioner=dict(type=SizePartitioner, max_task_size=40000),
|
| 145 |
+
runner=dict(
|
| 146 |
+
type=LocalRunner,
|
| 147 |
+
max_num_workers=256,
|
| 148 |
+
task=dict(type=OpenICLInferTask)),
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
# ------------- 评测配置 --------------------------------
|
| 152 |
+
eval = dict(
|
| 153 |
+
partitioner=dict(
|
| 154 |
+
type=SubjectiveSizePartitioner, max_task_size=80000, mode='singlescore', models=models, judge_models=judge_models,
|
| 155 |
+
),
|
| 156 |
+
runner=dict(type=LocalRunner,
|
| 157 |
+
max_num_workers=16, task=dict(type=SubjectiveEvalTask)),
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
summarizer = dict(
|
| 161 |
+
type=AllObjSummarizer
|
| 162 |
+
)
|
| 163 |
+
|
| 164 |
+
# 输出文件夹
|
| 165 |
+
work_dir = 'outputs/obj_all/'
|
| 166 |
+
```
|
| 167 |
+
|
| 168 |
+
### 第二步 启动评测并输出评测结果
|
| 169 |
+
|
| 170 |
+
```shell
|
| 171 |
+
python run.py eval_math_llm_judge.py
|
| 172 |
+
```
|
| 173 |
+
|
| 174 |
+
此时会进行两轮评测,第一轮是模型推理得到问题的预测答案,第二轮是JudgeLLM评测预测答案和标准答案的一致性,并打分。
|
| 175 |
+
|
| 176 |
+
- 模型预测的结果会保存在 `output/.../timestamp/predictions/xxmodel/xxx.json`
|
| 177 |
+
- JudgeLLM的评测回复会保存在 `output/.../timestamp/results/xxmodel/xxx.json`
|
| 178 |
+
- 评测报告则会输出到 `output/.../timestamp/summary/timestamp/xxx.csv`
|
| 179 |
+
|
| 180 |
+
## 评测结果
|
| 181 |
+
|
| 182 |
+
采用Llama3-8b-instruct作为被评测模型,Llama3-70b-instruct作为评价模型(JudgeLLM),对MATH数据集进行评价,结果如下:
|
| 183 |
+
|
| 184 |
+
| Model | JudgeLLM Evaluation | Naive Evaluation |
|
| 185 |
+
| ------------------- | ------------------- | ---------------- |
|
| 186 |
+
| llama-3-8b-instruct | 27.7 | 27.8 |
|