diff --git a/lm-evaluation-harness/.coveragerc b/lm-evaluation-harness/.coveragerc deleted file mode 100644 index 1248476304d3d43662439148724428792f150585..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/.coveragerc +++ /dev/null @@ -1,28 +0,0 @@ -[run] - -# tasks that aren't wired up. -omit = - lm_eval/tasks/quac.py - lm_eval/tasks/storycloze.py - lm_eval/tasks/cbt.py - lm_eval/tasks/sat.py - lm_eval/tasks/triviaqa.py - lm_eval/tasks/naturalqs.py - lm_eval/models/dummy.py - -[report] -exclude_lines = - # Skip any pass lines such as may be used for @abstractmethod - pass - - # Have to re-enable the standard pragma - pragma: no cover - - # Don't complain about missing debug-only code: - def __repr__ - if self\.debug - - # Don't complain if tests don't hit defensive assertion code: - raise AssertionError - raise NotImplementedError - return NotImplemented diff --git a/lm-evaluation-harness/.flake8 b/lm-evaluation-harness/.flake8 deleted file mode 100644 index 73f6455d132003fce0034f41d72eeb901b68f039..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/.flake8 +++ /dev/null @@ -1,5 +0,0 @@ -[flake8] -ignore = E203, E266, E501, W503, F403, F401, C901 -max-line-length = 127 -max-complexity = 10 -select = B,C,E,F,W,T4,B9 diff --git a/lm-evaluation-harness/.github/workflows/new_tasks.yml b/lm-evaluation-harness/.github/workflows/new_tasks.yml deleted file mode 100644 index 79567bfb8d4ef45ad1a4ef9aca62b8e24ca8080a..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/.github/workflows/new_tasks.yml +++ /dev/null @@ -1,71 +0,0 @@ -name: Tasks Modified - -on: - push: - branches: - - 'main' - pull_request: - branches: - - 'main' - workflow_dispatch: -# comment/edit out the above to stop/change the triggers -jobs: - changed_files: - runs-on: ubuntu-latest # windows-latest || macos-latest - timeout-minutes: 120 - name: Scan for changed tasks - steps: - - name: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 2 # 
OR "2" -> To retrieve the preceding commit. - - # Uses the tj-actions/changed-files action to check for changes. - # The `files_yaml` input optionally takes a yaml string to specify filters, - # and prepends the filter name to the standard output names. - - name: Check task folders - id: changed-tasks - uses: tj-actions/changed-files@v46.0.5 - with: - # tasks checks the tasks folder and api checks the api folder for changes - files_yaml: | - tasks: - - lm_eval/tasks/** - api: - - lm_eval/api/** - write_output_files: true - - # The next step is optional; the files are written to the workspace by default (above). - # so it's just for debugging - - name: Run Tests - if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' - run: | - echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV' - echo "One or more test file(s) has changed." - echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}" - - - name: Set up Python 3.9 - if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' - uses: actions/setup-python@v5 - with: - python-version: 3.9 - cache: 'pip' - cache-dependency-path: setup.py - - name: Install dependencies - if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' - run: | - python -m pip install --upgrade pip - pip install -e '.[dev,ifeval]' --extra-index-url https://download.pytorch.org/whl/cpu - # Install optional git dependencies - # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt - # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - - name: Test with pytest - # if new tasks are added, run tests on them - if: steps.changed-tasks.outputs.tasks_any_modified == 'true' - run: python -m pytest 
tests/test_tasks.py -s -vv - # if api is modified, run tests on it - - name: Test more tasks with pytest - env: - API: true - if: steps.changed-tasks.outputs.api_any_modified == 'true' - run: python -m pytest tests/test_tasks.py -s -vv diff --git a/lm-evaluation-harness/.github/workflows/publish.yml b/lm-evaluation-harness/.github/workflows/publish.yml deleted file mode 100644 index a053961669ed99ce82bf3546ad93609a1b94283f..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/.github/workflows/publish.yml +++ /dev/null @@ -1,97 +0,0 @@ -name: Publish Python distribution to PyPI - -on: - push: - tags: - - '*' - -jobs: - build: - name: Build distribution - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.x" - - - name: Check version consistency - run: | - # Extract version from pyproject.toml - PYPROJECT_VERSION=$(grep 'version = ' pyproject.toml | head -1 | cut -d'"' -f2) - - # Extract version from __init__.py - INIT_VERSION=$(grep '__version__ = ' lm_eval/__init__.py | head -1 | cut -d'"' -f2) - - echo "Version in pyproject.toml: $PYPROJECT_VERSION" - echo "Version in __init__.py: $INIT_VERSION" - - # Check if versions match - if [ "$PYPROJECT_VERSION" != "$INIT_VERSION" ]; then - echo "Error: Version mismatch between pyproject.toml ($PYPROJECT_VERSION) and __init__.py ($INIT_VERSION)" - exit 1 - fi - - echo "Version check passed: $PYPROJECT_VERSION" - - - name: Install pypa/build - run: >- - python3 -m - pip install - build - --user - - name: Build a binary wheel and a source tarball - run: python3 -m build - - name: Store the distribution packages - uses: actions/upload-artifact@v4 - with: - name: python-package-distributions - path: dist/ - - publish-to-pypi: - name: >- - Publish Python distribution to PyPI - if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes - needs: - - build - runs-on: ubuntu-latest - environment: - 
name: pypi - url: https://pypi.org/p/lm_eval - permissions: - id-token: write # IMPORTANT: mandatory for trusted publishing - - steps: - - name: Download all the dists - uses: actions/download-artifact@v4 - with: - name: python-package-distributions - path: dist/ - - name: Publish distribution to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - - publish-to-testpypi: - name: Publish Python distribution to TestPyPI - needs: - - build - runs-on: ubuntu-latest - - environment: - name: testpypi - url: https://test.pypi.org/p/lm_eval - - permissions: - id-token: write # IMPORTANT: mandatory for trusted publishing - - steps: - - name: Download all the dists - uses: actions/download-artifact@v4 - with: - name: python-package-distributions - path: dist/ - - name: Publish distribution to TestPyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - repository-url: https://test.pypi.org/legacy/ diff --git a/lm-evaluation-harness/.github/workflows/unit_tests.yml b/lm-evaluation-harness/.github/workflows/unit_tests.yml deleted file mode 100644 index b9a448642dbb5ef70339a324a6b58641942b2506..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/.github/workflows/unit_tests.yml +++ /dev/null @@ -1,114 +0,0 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python -# just comment out unwanted steps to turn off the test. -name: Unit Tests - -on: - push: - branches: - - 'main' - pull_request: - branches: - - 'main' - workflow_dispatch: -# Jobs run concurrently and steps run sequentially within a job. -# jobs: linter and cpu_tests. Add more jobs/steps as required. 
-jobs: - linter: - name: Linters - runs-on: ubuntu-latest - timeout-minutes: 5 - - steps: - - name: Checkout Code - uses: actions/checkout@v4 - - name: Set up Python 3.9 - uses: actions/setup-python@v5 - with: - python-version: 3.9 - cache: pip - cache-dependency-path: pyproject.toml - - name: Pre-Commit - env: - SKIP: "no-commit-to-branch,mypy" - uses: pre-commit/action@v3.0.1 - # Job 2 - testcpu: - name: CPU Tests - runs-on: ubuntu-latest - strategy: - fail-fast: true - matrix: - python-version: ["3.9", "3.10", "3.11"] - timeout-minutes: 30 - steps: - - name: Checkout Code - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - cache: pip - cache-dependency-path: pyproject.toml - - # Cache HuggingFace cache directory for CPU tests - - name: Cache HuggingFace cache (CPU tests) - uses: actions/cache@v3 - id: cache-hf-cpu - with: - path: ~/.cache/huggingface - key: ${{ runner.os }}-hf-cache-cpu - restore-keys: | - ${{ runner.os }}-hf-cache-cpu - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -e '.[dev]' --extra-index-url https://download.pytorch.org/whl/cpu - pip install hf_xet - - - name: Test with pytest - run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py --ignore=tests/models/test_hf_steered.py - continue-on-error: true # Continue workflow even if tests fail - - # Save test artifacts - - name: Archive test artifacts - uses: actions/upload-artifact@v4 - with: - name: output_testcpu${{ matrix.python-version }} - path: | - test_logs/* - -# testmodels: -# name: External LM Tests -# runs-on: ubuntu-latest -# timeout-minutes: 30 -# steps: -# - name: Checkout Code -# uses: actions/checkout@v4 -# - name: Set up Python 3.9 -# uses: actions/setup-python@v5 -# with: -# python-version: 3.9 -# cache: pip -# cache-dependency-path: 
pyproject.toml -# -# # Cache HuggingFace cache directory for External LM tests -# - name: Cache HuggingFace cache (External LM tests) -# uses: actions/cache@v3 -# id: cache-hf-lm -# with: -# path: ~/.cache/huggingface -# key: ${{ runner.os }}-hf-cache-external-lm -# restore-keys: | -# ${{ runner.os }}-hf-cache-external-lm -# -# - name: Install dependencies -# run: | -# python -m pip install --upgrade pip -# pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu -# pip install -U transformers peft accelerate -# -# - name: Test with pytest -# run: python -m pytest tests/models --showlocals -s -vv -# continue-on-error: true # Continue workflow even if tests fail diff --git a/lm-evaluation-harness/.gitignore b/lm-evaluation-harness/.gitignore deleted file mode 100644 index 9ae167be97686d8e332e469e3d84708879860091..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/.gitignore +++ /dev/null @@ -1,47 +0,0 @@ -# macOS system files -.DS_Store - -# Virtual environments -.venv/ -venv/ -ENV/ -env/ -*.env - -# Python bytecode and build artifacts -__pycache__/ -*.py[cod] -*.so -*.egg-info/ -build/ -dist/ - -# IDE & editor settings -.vscode/ -.idea/ - -# Jupyter -.ipynb_checkpoints/ -profile_default/ -ipython_config.py - -# Output and data -output/ -data/ -temp/ -test_logs/ - -# Caching -lm_eval/caching/.cache -lm_cache/ - -# Logging -*.log -logs/ - -# wandb experiment tracking -wandb/ -examples/wandb/ - -# PyInstaller -*.spec diff --git a/lm-evaluation-harness/.pre-commit-config.yaml b/lm-evaluation-harness/.pre-commit-config.yaml deleted file mode 100644 index 1aecc758adaf13372960a13ed42861e002b9dbbb..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/.pre-commit-config.yaml +++ /dev/null @@ -1,60 +0,0 @@ -# Ignore test linting to avoid conflicting changes to version stability. 
-exclude: ^tests/testdata/ -repos: - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 - hooks: - - id: check-added-large-files - - id: check-ast - - id: check-byte-order-marker - - id: check-case-conflict - - id: check-json - - id: check-merge-conflict - args: [--assume-in-merge] - - id: check-symlinks - - id: check-yaml - args: ["--unsafe"] - - id: destroyed-symlinks - - id: detect-private-key - - id: end-of-file-fixer - - id: no-commit-to-branch - always_run: false - - id: requirements-txt-fixer - - id: trailing-whitespace - args: [--markdown-linebreak-ext=md] - - id: fix-byte-order-marker - exclude: docs/CNAME - - id: fix-encoding-pragma - args: [--remove] - - id: mixed-line-ending - args: [--fix=lf] - - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.0 - hooks: - # Run the linter. - - id: ruff - args: - - --fix - # Run the formatter. - - id: ruff-format - - repo: https://github.com/codespell-project/codespell - rev: v2.4.1 - hooks: - - id: codespell - exclude: > - (?x)^( - .*\.json|ignore.txt|lm_eval/tasks/.*|.*yaml|.*\.ipynb - )$ - args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt] - - repo: https://github.com/jackdewinter/pymarkdown - rev: v0.9.29 - hooks: - - id: pymarkdown - exclude: ^lm_eval/tasks/ - args: [fix, -r] -# - repo: https://github.com/pre-commit/mirrors-mypy -# rev: v1.5.1 -# hooks: -# - id: mypy -# additional_dependencies: [".[sentencepiece,multilingual,promptsource,gptq]", "types-PyYAML", "types-requests"] -# exclude: ^tests/.*$ diff --git a/lm-evaluation-harness/CITATION.bib b/lm-evaluation-harness/CITATION.bib deleted file mode 100644 index 4ec33f139693aad74d2cb89c5edb2a578a315dd2..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/CITATION.bib +++ /dev/null @@ -1,10 +0,0 @@ -@misc{eval-harness, - author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le 
Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy}, - title = {A framework for few-shot language model evaluation}, - month = 12, - year = 2023, - publisher = {Zenodo}, - version = {v0.4.0}, - doi = {10.5281/zenodo.10256836}, - url = {https://zenodo.org/records/10256836} -} diff --git a/lm-evaluation-harness/CODEOWNERS b/lm-evaluation-harness/CODEOWNERS deleted file mode 100644 index 32f61c33f06047800239d50416a2f86a8f63c6bd..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/CODEOWNERS +++ /dev/null @@ -1 +0,0 @@ -* @baberabb @stellaathena diff --git a/lm-evaluation-harness/LICENSE.md b/lm-evaluation-harness/LICENSE.md deleted file mode 100644 index 12e6063183935e876e232db276568baf4954b492..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/LICENSE.md +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2020 EleutherAI - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/lm-evaluation-harness/MANIFEST.in b/lm-evaluation-harness/MANIFEST.in deleted file mode 100644 index 93f181def4d46b013259fae732c03ce172127da8..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -recursive-include tests diff --git a/lm-evaluation-harness/README.md b/lm-evaluation-harness/README.md deleted file mode 100644 index f325ae478dad8bba17e8a35d0ec940834e545f63..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/README.md +++ /dev/null @@ -1,625 +0,0 @@ -# Language Model Evaluation Harness - -[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10256836.svg)](https://doi.org/10.5281/zenodo.10256836) - ---- - -## Latest News 📣 - -- [2025/03] Added support for steering HF models! -- [2025/02] Added [SGLang](https://docs.sglang.ai/) support! -- [2024/09] We are prototyping allowing users of LM Evaluation Harness to create and evaluate on text+image multimodal input, text output tasks, and have just added the `hf-multimodal` and `vllm-vlm` model types and `mmmu` task as a prototype feature. We welcome users to try out this in-progress feature and stress-test it for themselves, and suggest they check out [`lmms-eval`](https://github.com/EvolvingLMMs-Lab/lmms-eval), a wonderful project originally forking off of the lm-evaluation-harness, for a broader range of multimodal tasks, models, and features. -- [2024/07] [API model](docs/API_guide.md) support has been updated and refactored, introducing support for batched and async requests, and making it significantly easier to customize and use for your own purposes. 
**To run Llama 405B, we recommend using VLLM's OpenAI-compliant API to host the model, and use the `local-completions` model type to evaluate the model.** -- [2024/07] New Open LLM Leaderboard tasks have been added ! You can find them under the [leaderboard](lm_eval/tasks/leaderboard/README.md) task group. - ---- - -## Announcement - -**A new v0.4.0 release of lm-evaluation-harness is available** ! - -New updates and features include: - -- **New Open LLM Leaderboard tasks have been added ! You can find them under the [leaderboard](lm_eval/tasks/leaderboard/README.md) task group.** -- Internal refactoring -- Config-based task creation and configuration -- Easier import and sharing of externally-defined task config YAMLs -- Support for Jinja2 prompt design, easy modification of prompts + prompt imports from Promptsource -- More advanced configuration options, including output post-processing, answer extraction, and multiple LM generations per document, configurable fewshot settings, and more -- Speedups and new modeling libraries supported, including: faster data-parallel HF model usage, vLLM support, MPS support with HuggingFace, and more -- Logging and usability changes -- New tasks including CoT BIG-Bench-Hard, Belebele, user-defined task groupings, and more - -Please see our updated documentation pages in `docs/` for more details. - -Development will be continuing on the `main` branch, and we encourage you to give us feedback on what features are desired and how to improve the library further, or ask questions, either in issues or PRs on GitHub, or in the [EleutherAI discord](https://discord.gg/eleutherai)! - ---- - -## Overview - -This project provides a unified framework to test generative language models on a large number of different evaluation tasks. - -**Features:** - -- Over 60 standard academic benchmarks for LLMs, with hundreds of subtasks and variants implemented. 
-- Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface. -- Support for fast and memory-efficient inference with [vLLM](https://github.com/vllm-project/vllm). -- Support for commercial APIs including [OpenAI](https://openai.com), and [TextSynth](https://textsynth.com/). -- Support for evaluation on adapters (e.g. LoRA) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft). -- Support for local models and benchmarks. -- Evaluation with publicly available prompts ensures reproducibility and comparability between papers. -- Easy support for custom prompts and evaluation metrics. - -The Language Model Evaluation Harness is the backend for 🤗 Hugging Face's popular [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), has been used in [hundreds of papers](https://scholar.google.com/scholar?oi=bibs&hl=en&authuser=2&cites=15052937328817631261,4097184744846514103,1520777361382155671,17476825572045927382,18443729326628441434,14801318227356878622,7890865700763267262,12854182577605049984,15641002901115500560,5104500764547628290), and is used internally by dozens of organizations including NVIDIA, Cohere, BigScience, BigCode, Nous Research, and Mosaic ML. - -## Install - -To install the `lm-eval` package from the github repository, run: - -```bash -git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness -cd lm-evaluation-harness -pip install -e . -``` - -We also provide a number of optional dependencies for extended functionality. A detailed table is available at the end of this document. 
- -## Basic Usage - -### User Guide - -A user guide detailing the full list of supported arguments is provided [here](./docs/interface.md), and on the terminal by calling `lm_eval -h`. Alternatively, you can use `lm-eval` instead of `lm_eval`. - -A list of supported tasks (or groupings of tasks) can be viewed with `lm-eval --tasks list`. Task descriptions and links to corresponding subfolders are provided [here](./lm_eval/tasks/README.md). - -### Hugging Face `transformers` - -To evaluate a model hosted on the [HuggingFace Hub](https://huggingface.co/models) (e.g. GPT-J-6B) on `hellaswag` you can use the following command (this assumes you are using a CUDA-compatible GPU): - -```bash -lm_eval --model hf \ - --model_args pretrained=EleutherAI/gpt-j-6B \ - --tasks hellaswag \ - --device cuda:0 \ - --batch_size 8 -``` - -Additional arguments can be provided to the model constructor using the `--model_args` flag. Most notably, this supports the common practice of using the `revisions` feature on the Hub to store partially trained checkpoints, or to specify the datatype for running a model: - -```bash -lm_eval --model hf \ - --model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \ - --tasks lambada_openai,hellaswag \ - --device cuda:0 \ - --batch_size 8 -``` - -Models that are loaded via both `transformers.AutoModelForCausalLM` (autoregressive, decoder-only GPT style models) and `transformers.AutoModelForSeq2SeqLM` (such as encoder-decoder models like T5) in Huggingface are supported. - -Batch size selection can be automated by setting the ```--batch_size``` flag to ```auto```. This will perform automatic detection of the largest batch size that will fit on your device. On tasks where there is a large difference between the longest and shortest example, it can be helpful to periodically recompute the largest batch size, to gain a further speedup. 
To do this, append ```:N``` to the above flag to automatically recompute the largest batch size ```N``` times. For example, to recompute the batch size 4 times, the command would be: - -```bash -lm_eval --model hf \ - --model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \ - --tasks lambada_openai,hellaswag \ - --device cuda:0 \ - --batch_size auto:4 -``` - -> [!Note] -> Just like you can provide a local path to `transformers.AutoModel`, you can also provide a local path to `lm_eval` via `--model_args pretrained=/path/to/model` - -#### Multi-GPU Evaluation with Hugging Face `accelerate` - -We support three main ways of using Hugging Face's [accelerate 🚀](https://github.com/huggingface/accelerate) library for multi-GPU evaluation. - -To perform *data-parallel evaluation* (where each GPU loads a **separate full copy** of the model), we leverage the `accelerate` launcher as follows: - -```bash -accelerate launch -m lm_eval --model hf \ - --tasks lambada_openai,arc_easy \ - --batch_size 16 -``` - -(or via `accelerate launch --no-python lm_eval`). - -For cases where your model can fit on a single GPU, this allows you to evaluate on K GPUs K times faster than on one. - -**WARNING**: This setup does not work with FSDP model sharding, so in `accelerate config` FSDP must be disabled, or the NO_SHARD FSDP option must be used. - -The second way of using `accelerate` for multi-GPU evaluation is when your model is *too large to fit on a single GPU.* - -In this setting, run the library *outside the `accelerate` launcher*, but pass `parallelize=True` to `--model_args` as follows: - -```bash -lm_eval --model hf \ - --tasks lambada_openai,arc_easy \ - --model_args parallelize=True \ - --batch_size 16 -``` - -This means that your model's weights will be split across all available GPUs.
- -For more advanced users or even larger models, we allow for the following arguments when `parallelize=True` as well: - -- `device_map_option`: How to split model weights across available GPUs. defaults to "auto". -- `max_memory_per_gpu`: the max GPU memory to use per GPU in loading the model. -- `max_cpu_memory`: the max amount of CPU memory to use when offloading the model weights to RAM. -- `offload_folder`: a folder where model weights will be offloaded to disk if needed. - -The third option is to use both at the same time. This will allow you to take advantage of both data parallelism and model sharding, and is especially useful for models that are too large to fit on a single GPU. - -```bash -accelerate launch --multi_gpu --num_processes {nb_of_copies_of_your_model} \ - -m lm_eval --model hf \ - --tasks lambada_openai,arc_easy \ - --model_args parallelize=True \ - --batch_size 16 -``` - -To learn more about model parallelism and how to use it with the `accelerate` library, see the [accelerate documentation](https://huggingface.co/docs/transformers/v4.15.0/en/parallelism) - -**Warning: We do not natively support multi-node evaluation using the `hf` model type! 
Please reference [our GPT-NeoX library integration](https://github.com/EleutherAI/gpt-neox/blob/main/eval.py) for an example of code in which a custom multi-machine evaluation script is written.** - -**Note: we do not currently support multi-node evaluations natively, and advise using either an externally hosted server to run inference requests against, or creating a custom integration with your distributed framework [as is done for the GPT-NeoX library](https://github.com/EleutherAI/gpt-neox/blob/main/eval_tasks/eval_adapter.py).** - -### Steered Hugging Face `transformers` models - -To evaluate a Hugging Face `transformers` model with steering vectors applied, specify the model type as `steered` and provide the path to either a PyTorch file containing pre-defined steering vectors, or a CSV file that specifies how to derive steering vectors from pretrained `sparsify` or `sae_lens` models (you will need to install the corresponding optional dependency for this method). - -Specify pre-defined steering vectors: - -```python -import torch - -steer_config = { - "layers.3": { - "steering_vector": torch.randn(1, 768), - "bias": torch.randn(1, 768), - "steering_coefficient": 1, - "action": "add" - }, -} -torch.save(steer_config, "steer_config.pt") -``` - -Specify derived steering vectors: - -```python -import pandas as pd - -pd.DataFrame({ - "loader": ["sparsify"], - "action": ["add"], - "sparse_model": ["EleutherAI/sae-pythia-70m-32k"], - "hookpoint": ["layers.3"], - "feature_index": [30], - "steering_coefficient": [10.0], -}).to_csv("steer_config.csv", index=False) -``` - -Run the evaluation harness with steering vectors applied: - -```bash -lm_eval --model steered \ - --model_args pretrained=EleutherAI/pythia-160m,steer_path=steer_config.pt \ - --tasks lambada_openai,hellaswag \ - --device cuda:0 \ - --batch_size 8 -``` - -### NVIDIA `nemo` models - -[NVIDIA NeMo Framework](https://github.com/NVIDIA/NeMo) is a generative AI framework built for researchers and pytorch 
developers working on language models. - -To evaluate a `nemo` model, start by installing NeMo following [the documentation](https://github.com/NVIDIA/NeMo?tab=readme-ov-file#installation). We highly recommend using the NVIDIA PyTorch or NeMo container, especially if you have issues installing Apex or any other dependencies (see [latest released containers](https://github.com/NVIDIA/NeMo/releases)). Please also install the lm evaluation harness library following the instructions in [the Install section](https://github.com/EleutherAI/lm-evaluation-harness/tree/main?tab=readme-ov-file#install). - -NeMo models can be obtained through [NVIDIA NGC Catalog](https://catalog.ngc.nvidia.com/models) or in [NVIDIA's Hugging Face page](https://huggingface.co/nvidia). In [NVIDIA NeMo Framework](https://github.com/NVIDIA/NeMo/tree/main/scripts/nlp_language_modeling) there are conversion scripts to convert the `hf` checkpoints of popular models like llama, falcon, mixtral or mpt to `nemo`. - -Run a `nemo` model on one GPU: - -```bash -lm_eval --model nemo_lm \ - --model_args path= \ - --tasks hellaswag \ - --batch_size 32 -``` - -It is recommended to unpack the `nemo` model to avoid the unpacking inside the docker container - it may overflow disk space. For that you can run: - -```bash -mkdir MY_MODEL -tar -xvf MY_MODEL.nemo -C MY_MODEL -``` - -#### Multi-GPU evaluation with NVIDIA `nemo` models - -By default, only one GPU is used. But we do support either data replication or tensor/pipeline parallelism during evaluation, on one node. - -1) To enable data replication, set the `model_args` of `devices` to the number of data replicas to run.
For example, the command to run 8 data replicas over 8 GPUs is: - -```bash -torchrun --nproc-per-node=8 --no-python lm_eval \ - --model nemo_lm \ - --model_args path=,devices=8 \ - --tasks hellaswag \ - --batch_size 32 -``` - -1) To enable tensor and/or pipeline parallelism, set the `model_args` of `tensor_model_parallel_size` and/or `pipeline_model_parallel_size`. In addition, you also have to set up `devices` to be equal to the product of `tensor_model_parallel_size` and/or `pipeline_model_parallel_size`. For example, the command to use one node of 4 GPUs with tensor parallelism of 2 and pipeline parallelism of 2 is: - -```bash -torchrun --nproc-per-node=4 --no-python lm_eval \ - --model nemo_lm \ - --model_args path=,devices=4,tensor_model_parallel_size=2,pipeline_model_parallel_size=2 \ - --tasks hellaswag \ - --batch_size 32 -``` - -Note that it is recommended to substitute the `python` command by `torchrun --nproc-per-node= --no-python` to facilitate loading the model into the GPUs. This is especially important for large checkpoints loaded into multiple GPUs. - -Not supported yet: multi-node evaluation and combinations of data replication with tensor or pipeline parallelism. - -#### Multi-GPU evaluation with OpenVINO models - -Pipeline parallelism during evaluation is supported with OpenVINO models - -To enable pipeline parallelism, set the `model_args` of `pipeline_parallel`. In addition, you also have to set up `device` to value `HETERO:,` for example `HETERO:GPU.1,GPU.0` For example, the command to use pipeline parallelism of 2 is: - -```bash -lm_eval --model openvino \ - --tasks wikitext \ - --model_args pretrained=,pipeline_parallel=True \ - --device HETERO:GPU.1,GPU.0 -``` - -### Tensor + Data Parallel and Optimized Inference with `vLLM` - -We also support vLLM for faster inference on [supported model types](https://docs.vllm.ai/en/latest/models/supported_models.html), especially faster when splitting a model across multiple GPUs. 
For single-GPU or multi-GPU — tensor parallel, data parallel, or a combination of both — inference, for example: - -```bash -lm_eval --model vllm \ - --model_args pretrained={model_name},tensor_parallel_size={GPUs_per_model},dtype=auto,gpu_memory_utilization=0.8,data_parallel_size={model_replicas} \ - --tasks lambada_openai \ - --batch_size auto -``` - -To use vllm, do `pip install lm_eval[vllm]`. For a full list of supported vLLM configurations, please reference our [vLLM integration](https://github.com/EleutherAI/lm-evaluation-harness/blob/e74ec966556253fbe3d8ecba9de675c77c075bce/lm_eval/models/vllm_causallms.py) and the vLLM documentation. - -vLLM occasionally differs in output from Huggingface. We treat Huggingface as the reference implementation, and provide a [script](./scripts/model_comparator.py) for checking the validity of vllm results against HF. - -> [!Tip] -> For fastest performance, we recommend using `--batch_size auto` for vLLM whenever possible, to leverage its continuous batching functionality! - -> [!Tip] -> Passing `max_model_len=4096` or some other reasonable default to vLLM through model args may cause speedups or prevent out-of-memory errors when trying to use auto batch size, such as for Mistral-7B-v0.1 which defaults to a maximum length of 32k. - -### Tensor + Data Parallel and Fast Offline Batching Inference with `SGLang` - -We support SGLang for efficient offline batch inference. Its **[Fast Backend Runtime](https://docs.sglang.ai/index.html)** delivers high performance through optimized memory management and parallel processing techniques. Key features include tensor parallelism, continuous batching, and support for various quantization methods (FP8/INT4/AWQ/GPTQ). - -To use SGLang as the evaluation backend, please **install it in advance** via SGLang documents [here](https://docs.sglang.ai/start/install.html#install-sglang). 
- -> [!Tip] -> Due to the installing method of [`Flashinfer`](https://docs.flashinfer.ai/)-- a fast attention kernel library, we don't include the dependencies of `SGLang` within [pyproject.toml](pyproject.toml). Note that the `Flashinfer` also has some requirements on `torch` version. - -SGLang's server arguments are slightly different from other backends, see [here](https://docs.sglang.ai/backend/server_arguments.html) for more information. We provide an example of the usage here: - -```bash -lm_eval --model sglang \ - --model_args pretrained={model_name},dp_size={data_parallel_size},tp_size={tensor_parallel_size},dtype=auto \ - --tasks gsm8k_cot \ - --batch_size auto -``` - -> [!Tip] -> When encountering out of memory (OOM) errors (especially for multiple-choice tasks), try these solutions: -> -> 1. Use a manual `batch_size`, rather than `auto`. -> 2. Lower KV cache pool memory usage by adjusting `mem_fraction_static` - Add to your model arguments for example `--model_args pretrained=...,mem_fraction_static=0.7`. -> 3. Increase tensor parallel size `tp_size` (if using multiple GPUs). - -### Model APIs and Inference Servers - -Our library also supports the evaluation of models served via several commercial APIs, and we hope to implement support for the most commonly used performant local/self-hosted inference servers. - -To call a hosted model, use: - -```bash -export OPENAI_API_KEY=YOUR_KEY_HERE -lm_eval --model openai-completions \ - --model_args model=davinci-002 \ - --tasks lambada_openai,hellaswag -``` - -We also support using your own local inference server with servers that mirror the OpenAI Completions and ChatCompletions APIs. 
- -```bash -lm_eval --model local-completions --tasks gsm8k --model_args model=facebook/opt-125m,base_url=http://{yourip}:8000/v1/completions,num_concurrent=1,max_retries=3,tokenized_requests=False,batch_size=16 -``` - -Note that for externally hosted models, configs such as `--device` which relate to where to place a local model should not be used and do not function. Just like you can use `--model_args` to pass arbitrary arguments to the model constructor for local models, you can use it to pass arbitrary arguments to the model API for hosted models. See the documentation of the hosting service for information on what arguments they support. - -| API or Inference Server | Implemented? | `--model ` name | Models supported: | Request Types: | -| --------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------| -| OpenAI Completions | :heavy_check_mark: | `openai-completions`, `local-completions` | All OpenAI Completions API models | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| OpenAI ChatCompletions | :heavy_check_mark: | `openai-chat-completions`, `local-chat-completions` | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt) | `generate_until` (no logprobs) | -| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic 
Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no logprobs) | -| Anthropic Chat | :heavy_check_mark: | `anthropic-chat`, `anthropic-chat-completions` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/docs/models-overview) | `generate_until` (no logprobs) | -| Textsynth | :heavy_check_mark: | `textsynth` | [All supported engines](https://textsynth.com/documentation.html#engines) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| Cohere | [:hourglass: - blocked on Cohere API bug](https://github.com/EleutherAI/lm-evaluation-harness/pull/395) | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| [Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)) | :heavy_check_mark: | `gguf`, `ggml` | [All models supported by llama.cpp](https://github.com/ggerganov/llama.cpp) | `generate_until`, `loglikelihood`, (perplexity evaluation not yet implemented) | -| vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| Huggingface Optimum (Causal LMs) | :heavy_check_mark: | `openvino` | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| Huggingface Optimum-intel IPEX (Causal LMs) | :heavy_check_mark: | `ipex` | Any decoder-only AutoModelForCausalLM | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| Neuron via AWS Inf2 (Causal LMs) | :heavy_check_mark: | 
`neuronx` | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| [Neural Magic DeepSparse](https://github.com/neuralmagic/deepsparse) | :heavy_check_mark: | `deepsparse` | Any LM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub with the "deepsparse" tag](https://huggingface.co/models?other=deepsparse) | `generate_until`, `loglikelihood` | -| [Neural Magic SparseML](https://github.com/neuralmagic/sparseml) | :heavy_check_mark: | `sparseml` | Any decoder-only AutoModelForCausalLM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub](https://huggingface.co/neuralmagic). Especially useful for models with quantization like [`zoo:llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized`](https://sparsezoo.neuralmagic.com/models/llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| NVIDIA NeMo | :heavy_check_mark: | `nemo_lm` | [All supported models](https://docs.nvidia.com/nemo-framework/user-guide/24.09/nemotoolkit/core/core.html#nemo-models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| Watsonx.ai | :heavy_check_mark: | `watsonx_llm` | [Supported Watsonx.ai Engines](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx) | `generate_until` `loglikelihood` | -| [Your local inference server!](docs/API_guide.md) | :heavy_check_mark: | `local-completions` or `local-chat-completions` | Support for OpenAI API-compatible servers, with easy customization for other APIs. 
| `generate_until`, `loglikelihood`, `loglikelihood_rolling` | - -Models which do not supply logits or logprobs can be used with tasks of type `generate_until` only, while local models, or APIs that supply logprobs/logits of their prompts, can be run on all task types: `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`. - -For more information on the different task `output_types` and model request types, see [our documentation](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/model_guide.md#interface). - -> [!Note] -> For best performance with closed chat model APIs such as Anthropic Claude 3 and GPT-4, we recommend carefully looking at a few sample outputs using `--limit 10` first to confirm answer extraction and scoring on generative tasks are performing as expected. Providing `system="<some system prompt here>"` within `--model_args` for anthropic-chat-completions, to instruct the model what format to respond in, may be useful. - -### Other Frameworks - -A number of other libraries contain scripts for calling the eval harness through their library. These include [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/blob/main/eval_tasks/eval_adapter.py), [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md), and [mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/blob/master/eval_harness.py). - -To create your own custom integration you can follow instructions from [this tutorial](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage). - -### Additional Features - -> [!Note] -> For tasks unsuitable for direct evaluation — either due to risks associated with executing untrusted code or complexities in the evaluation process — the `--predict_only` flag is available to obtain decoded generations for post-hoc evaluation.
- -If you have a Metal compatible Mac, you can run the eval harness using the MPS back-end by replacing `--device cuda:0` with `--device mps` (requires PyTorch version 2.1 or higher). **Note that the PyTorch MPS backend is still in early stages of development, so correctness issues or unsupported operations may exist. If you observe oddities in model performance on the MPS back-end, we recommend first checking that a forward pass of your model on `--device cpu` and `--device mps` match.** - -> [!Note] -> You can inspect what the LM inputs look like by running the following command: -> -> ```bash -> python write_out.py \ -> --tasks \ -> --num_fewshot 5 \ -> --num_examples 10 \ -> --output_base_path /path/to/output/folder -> ``` -> -> This will write out one text file for each task. - -To verify the data integrity of the tasks you're performing in addition to running the tasks themselves, you can use the `--check_integrity` flag: - -```bash -lm_eval --model openai \ - --model_args engine=davinci-002 \ - --tasks lambada_openai,hellaswag \ - --check_integrity -``` - -## Advanced Usage Tips - -For models loaded with the HuggingFace `transformers` library, any arguments provided via `--model_args` get passed to the relevant constructor directly. This means that anything you can do with `AutoModel` can be done with our library. For example, you can pass a local path via `pretrained=` or use models finetuned with [PEFT](https://github.com/huggingface/peft) by taking the call you would run to evaluate the base model and add `,peft=PATH` to the `model_args` argument: - -```bash -lm_eval --model hf \ - --model_args pretrained=EleutherAI/gpt-j-6b,parallelize=True,load_in_4bit=True,peft=nomic-ai/gpt4all-j-lora \ - --tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \ - --device cuda:0 -``` - -Models provided as delta weights can be easily loaded using the Hugging Face transformers library. 
Within --model_args, set the delta argument to specify the delta weights, and use the pretrained argument to designate the relative base model to which they will be applied: - -```bash -lm_eval --model hf \ - --model_args pretrained=Ejafa/llama_7B,delta=lmsys/vicuna-7b-delta-v1.1 \ - --tasks hellaswag -``` - -GPTQ quantized models can be loaded using [GPTQModel](https://github.com/ModelCloud/GPTQModel) (faster) or [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) - -GPTQModel: add `,gptqmodel=True` to `model_args` - -```bash -lm_eval --model hf \ - --model_args pretrained=model-name-or-path,gptqmodel=True \ - --tasks hellaswag -``` - -AutoGPTQ: add `,autogptq=True` to `model_args`: - -```bash -lm_eval --model hf \ - --model_args pretrained=model-name-or-path,autogptq=model.safetensors,gptq_use_triton=True \ - --tasks hellaswag -``` - -We support wildcards in task names, for example you can run all of the machine-translated lambada tasks via `--task lambada_openai_mt_*`. - -## Saving & Caching Results - -To save evaluation results provide an `--output_path`. We also support logging model responses with the `--log_samples` flag for post-hoc analysis. - -> [!TIP] -> Use `--use_cache ` to cache evaluation results and skip previously evaluated samples when resuming runs of the same (model, task) pairs. Note that caching is rank-dependent, so restart with the same GPU count if interrupted. You can also use --cache_requests to save dataset preprocessing steps for faster evaluation resumption. - -To push results and samples to the Hugging Face Hub, first ensure an access token with write access is set in the `HF_TOKEN` environment variable. Then, use the `--hf_hub_log_args` flag to specify the organization, repository name, repository visibility, and whether to push results and samples to the Hub - [example dataset on the HF Hub](https://huggingface.co/datasets/KonradSzafer/lm-eval-results-demo). 
For instance: - -```bash -lm_eval --model hf \ - --model_args pretrained=model-name-or-path,autogptq=model.safetensors,gptq_use_triton=True \ - --tasks hellaswag \ - --log_samples \ - --output_path results \ - --hf_hub_log_args hub_results_org=EleutherAI,hub_repo_name=lm-eval-results,push_results_to_hub=True,push_samples_to_hub=True,public_repo=False \ -``` - -This allows you to easily download the results and samples from the Hub, using: - -```python -from datasets import load_dataset - -load_dataset("EleutherAI/lm-eval-results-private", "hellaswag", "latest") -``` - -For a full list of supported arguments, check out the [interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md) guide in our documentation! - -## Visualizing Results - -You can seamlessly visualize and analyze the results of your evaluation harness runs using both Weights & Biases (W&B) and Zeno. - -### Zeno - -You can use [Zeno](https://zenoml.com) to visualize the results of your eval harness runs. - -First, head to [hub.zenoml.com](https://hub.zenoml.com) to create an account and get an API key [on your account page](https://hub.zenoml.com/account). -Add this key as an environment variable: - -```bash -export ZENO_API_KEY=[your api key] -``` - -You'll also need to install the `lm_eval[zeno]` package extra. - -To visualize the results, run the eval harness with the `log_samples` and `output_path` flags. -We expect `output_path` to contain multiple folders that represent individual model names. -You can thus run your evaluation on any number of tasks and models and upload all of the results as projects on Zeno. 
- -```bash -lm_eval \ - --model hf \ - --model_args pretrained=EleutherAI/gpt-j-6B \ - --tasks hellaswag \ - --device cuda:0 \ - --batch_size 8 \ - --log_samples \ - --output_path output/gpt-j-6B -``` - -Then, you can upload the resulting data using the `zeno_visualize` script: - -```bash -python scripts/zeno_visualize.py \ - --data_path output \ - --project_name "Eleuther Project" -``` - -This will use all subfolders in `data_path` as different models and upload all tasks within these model folders to Zeno. -If you run the eval harness on multiple tasks, the `project_name` will be used as a prefix and one project will be created per task. - -You can find an example of this workflow in [examples/visualize-zeno.ipynb](examples/visualize-zeno.ipynb). - -### Weights and Biases - -With the [Weights and Biases](https://wandb.ai/site) integration, you can now spend more time extracting deeper insights into your evaluation results. The integration is designed to streamline the process of logging and visualizing experiment results using the Weights & Biases (W&B) platform. - -The integration provides the following functionality: - -- to automatically log the evaluation results, -- log the samples as W&B Tables for easy visualization, -- log the `results.json` file as an artifact for version control, -- log the `<task_name>_eval_samples.json` file if the samples are logged, -- generate a comprehensive report for analysis and visualization with all the important metrics, -- log task and cli specific configs, -- and more out of the box like the command used to run the evaluation, GPU/CPU counts, timestamp, etc. - -First you'll need to install the lm_eval[wandb] package extra. Do `pip install lm_eval[wandb]`. - -Authenticate your machine with your unique W&B token. Visit https://wandb.ai/authorize to get one. Do `wandb login` in your command line terminal. - -Run eval harness as usual with a `wandb_args` flag.
Use this flag to provide arguments for initializing a wandb run ([wandb.init](https://docs.wandb.ai/ref/python/init)) as comma separated string arguments. - -```bash -lm_eval \ - --model hf \ - --model_args pretrained=microsoft/phi-2,trust_remote_code=True \ - --tasks hellaswag,mmlu_abstract_algebra \ - --device cuda:0 \ - --batch_size 8 \ - --output_path output/phi-2 \ - --limit 10 \ - --wandb_args project=lm-eval-harness-integration \ - --log_samples -``` - -In the stdout, you will find the link to the W&B run page as well as link to the generated report. You can find an example of this workflow in [examples/visualize-wandb.ipynb](examples/visualize-wandb.ipynb), and an example of how to integrate it beyond the CLI. - -## How to Contribute or Learn More? - -For more information on the library and how everything fits together, check out all of our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs)! We plan to post a larger roadmap of desired + planned library improvements soon, with more information on how contributors can help. - -### Implementing new tasks - -To implement a new task in the eval harness, see [this guide](./docs/new_task_guide.md). - -In general, we follow this priority list for addressing concerns about prompting and other eval details: - -1. If there is widespread agreement among people who train LLMs, use the agreed upon procedure. -2. If there is a clear and unambiguous official implementation, use that procedure. -3. If there is widespread agreement among people who evaluate LLMs, use the agreed upon procedure. -4. If there are multiple common implementations but not universal or widespread agreement, use our preferred option among the common implementations. As before, prioritize choosing from among the implementations found in LLM training papers. - -These are guidelines and not rules, and can be overruled in special circumstances. 
- -We try to prioritize agreement with the procedures used by other groups to decrease the harm when people inevitably compare runs across different papers despite our discouragement of the practice. Historically, we also prioritized the implementation from [Language Models are Few Shot Learners](https://arxiv.org/abs/2005.14165) as our original goal was specifically to compare results with that paper. - -### Support - -The best way to get support is to open an issue on this repo or join the [EleutherAI Discord server](https://discord.gg/eleutherai). The `#lm-thunderdome` channel is dedicated to developing this project and the `#release-discussion` channel is for receiving support for our releases. If you've used the library and have had a positive (or negative) experience, we'd love to hear from you! - -## Optional Extras - -Extras dependencies can be installed via `pip install -e ".[NAME]"` - -| Name | Use | -| -------------------- | ----------------------------------------------------- | -| api | For using api models (Anthropic, OpenAI API) | -| audiolm_qwen | For running Qwen2 audio models | -| deepsparse | For running NM's DeepSparse models | -| dev | For linting PRs and contributions | -| gptq | For loading models with AutoGPTQ | -| gptqmodel | For loading models with GPTQModel | -| hf_transfer | For speeding up HF Hub file downloads | -| ibm_watsonx_ai | For using IBM watsonx.ai model apis | -| ifeval | For running the IFEval task | -| ipex | For running on optimum-intel ipex backend | -| japanese_leaderboard | For running Japanese LLM Leaderboard tasks | -| longbench | For running LongBench tasks | -| mamba | For loading Mamba SSM models | -| math | For running math task answer checking | -| multilingual | For multilingual tokenizers | -| neuronx | For running on AWS inf2 instances | -| optimum | For running Intel OpenVINO models | -| promptsource | For using PromptSource prompts | -| ruler | For running RULER tasks | -| sae_lens | For using SAELens to 
steer models | -| sentencepiece | For using the sentencepiece tokenizer | -| sparseml | For using NM's SparseML models | -| sparsify | For using Sparsify to steer models | -| testing | For running library test suite | -| vllm | For loading models with vLLM | -| wandb | For integration with `Weights and Biases` platform | -| zeno | For visualizing results with Zeno | -| -------------------- | ----------------------------------------------------- | -| all | Loads all extras (not recommended) | - -## Cite as - -```text -@misc{eval-harness, - author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy}, - title = {The Language Model Evaluation Harness}, - month = 07, - year = 2024, - publisher = {Zenodo}, - version = {v0.4.3}, - doi = {10.5281/zenodo.12608602}, - url = {https://zenodo.org/records/12608602} -} -``` diff --git a/lm-evaluation-harness/ignore.txt b/lm-evaluation-harness/ignore.txt deleted file mode 100644 index de10b539b98c9e500d2d838ed3eb9bece95c00e2..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/ignore.txt +++ /dev/null @@ -1,8 +0,0 @@ -ROUGE -rouge -nin -maka -mor -te -ond -extraversion diff --git a/lm-evaluation-harness/lm_eval/__init__.py b/lm-evaluation-harness/lm_eval/__init__.py deleted file mode 100644 index fece9162482d018c2969a9e67603096d0ad21713..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -import logging -import os - -from .evaluator import evaluate, simple_evaluate - - -__version__ = "0.4.8" diff --git a/lm-evaluation-harness/lm_eval/__main__.py 
def try_parse_json(value: str) -> Union[str, dict, None]:
    """Parse a CLI value as JSON, falling back to the raw string.

    Used as an ``argparse`` ``type=`` callable so an option can be given
    either as ``key=value,key=value`` or as a JSON object.

    Args:
        value: Raw command-line string, or ``None``.

    Returns:
        ``None`` when *value* is ``None`` or the JSON literal ``null``; the
        decoded ``dict`` or ``str`` when *value* is a JSON object or JSON
        string; otherwise *value* unchanged.

    Raises:
        argparse.ArgumentTypeError: If *value* contains ``{`` (so it looks
            like a JSON object) but is not valid JSON.
    """
    if value is None:
        return None
    try:
        parsed = json.loads(value)
    except json.JSONDecodeError as err:
        if "{" in value:
            raise argparse.ArgumentTypeError(
                f"Invalid JSON: {value}. Hint: Use double quotes for JSON strings."
            ) from err
        return value
    # Inputs such as "123", "true" or "[1, 2]" are valid JSON but decode to
    # types outside the declared return type. Hand the raw string back so the
    # result is always str, dict or None.
    if parsed is None or isinstance(parsed, (dict, str)):
        return parsed
    return value


def _int_or_none_list_arg_type(
    min_len: int, max_len: int, defaults: str, value: str, split_char: str = ","
) -> list[Union[int, None]]:
    """Parse a delimited list of integers and/or ``None`` of length *max_len*.

    Args:
        min_len: Fewest items accepted when more than one is supplied.
        max_len: Length of the returned list.
        defaults: Delimited defaults (same syntax as *value*) used to fill
            trailing positions the user left out.
        value: Raw command-line string.
        split_char: Item delimiter.

    Returns:
        A list of *max_len* entries, each an ``int`` or ``None``. A single
        supplied item is repeated for every position.

    Raises:
        argparse.ArgumentTypeError: If an item is neither an integer nor
            ``none`` (case-insensitive), or the item count is outside
            ``[min_len, max_len]`` and is not exactly one.
    """

    def parse_value(item):
        item = item.strip().lower()
        if item == "none":
            return None
        try:
            return int(item)
        except ValueError:
            # The ValueError adds nothing beyond this message.
            raise argparse.ArgumentTypeError(
                f"{item} is not an integer or None"
            ) from None

    items = [parse_value(v) for v in value.split(split_char)]
    num_items = len(items)

    if num_items == 1:
        # Makes downstream handling the same for single and multiple values
        items = items * max_len
    elif num_items < min_len or num_items > max_len:
        raise argparse.ArgumentTypeError(
            f"Argument requires {max_len} integers or None, separated by '{split_char}'"
        )
    elif num_items != max_len:
        logging.warning(
            f"Argument requires {max_len} integers or None, separated by '{split_char}'. "
            "Missing values will be filled with defaults."
        )
        default_items = [parse_value(v) for v in defaults.split(split_char)]
        # extend items list with missing defaults
        items.extend(default_items[num_items:])

    return items


def check_argument_types(parser: argparse.ArgumentParser) -> None:
    """Ensure every value-taking CLI argument declares a ``type``.

    The ``help`` action and actions with a truthy ``const`` (for example
    ``store_true`` flags) are skipped.

    Raises:
        ValueError: If a checked argument has no ``type``.
    """
    for action in parser._actions:
        if action.dest != "help" and not action.const and action.type is None:
            raise ValueError(
                f"Argument '{action.dest}' doesn't have a type specified."
            )
def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
    """Run an evaluation from parsed command-line arguments.

    Builds the evaluation tracker and task manager, resolves the requested
    tasks, calls ``evaluator.simple_evaluate`` and then prints and saves the
    results (and logs them to Weights & Biases when ``--wandb_args`` is set).

    Args:
        args: Pre-parsed arguments. When falsy, arguments are parsed from the
            command line via ``setup_parser`` / ``parse_eval_args``.

    Raises:
        ValueError: If ``--log_samples``/``--predict_only`` is given without
            ``--output_path``, if ``--fewshot_as_multiturn`` is given while
            ``--apply_chat_template`` is ``False``, or if a requested task
            cannot be found.
        AssertionError: If both ``--samples`` and ``--limit`` are given.
        SystemExit: After listing tasks, or (with status 1) when no task was
            specified.
    """
    if not args:
        # we allow for args to be passed externally, else we parse them ourselves
        parser = setup_parser()
        args = parse_eval_args(parser)

    if args.wandb_args:
        wandb_args_dict = simple_parse_args_string(args.wandb_args)
        wandb_config_args_dict = simple_parse_args_string(args.wandb_config_args)
        wandb_logger = WandbLogger(wandb_args_dict, wandb_config_args_dict)

    utils.setup_logging(args.verbosity)
    eval_logger = logging.getLogger(__name__)
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # update the evaluation tracker args with the output path and the HF token
    if args.output_path:
        args.hf_hub_log_args += f",output_path={args.output_path}"
    if os.environ.get("HF_TOKEN", None):
        args.hf_hub_log_args += f",token={os.environ.get('HF_TOKEN')}"
    evaluation_tracker_args = simple_parse_args_string(args.hf_hub_log_args)
    evaluation_tracker = EvaluationTracker(**evaluation_tracker_args)

    if args.predict_only:
        args.log_samples = True
    if (args.log_samples or args.predict_only) and not args.output_path:
        raise ValueError(
            "Specify --output_path if providing --log_samples or --predict_only"
        )

    if args.fewshot_as_multiturn and args.apply_chat_template is False:
        raise ValueError(
            "When `fewshot_as_multiturn` is selected, `apply_chat_template` must be set (either to `True` or to the chosen template name)."
        )

    if args.include_path is not None:
        eval_logger.info(f"Including path: {args.include_path}")

    # Task metadata is the model args overlaid with the explicit --metadata
    # values; on a key clash --metadata wins (right operand of `|`).
    if isinstance(args.model_args, str):
        model_metadata = simple_parse_args_string(args.model_args)
    elif isinstance(args.model_args, dict):
        model_metadata = args.model_args
    else:
        model_metadata = {}
    extra_metadata = (
        args.metadata
        if isinstance(args.metadata, dict)
        else simple_parse_args_string(args.metadata)
    )
    metadata = model_metadata | extra_metadata

    task_manager = TaskManager(include_path=args.include_path, metadata=metadata)

    if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
        eval_logger.warning(
            "Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub."
        )

    if args.limit:
        # The two sentences were previously concatenated without a separator.
        eval_logger.warning(
            " --limit SHOULD ONLY BE USED FOR TESTING. "
            "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )
    if args.samples:
        assert args.limit is None, (
            "If --samples is not None, then --limit must be None."
        )
        # --samples is either a path to a JSON file or an inline JSON string.
        if (samples := Path(args.samples)).is_file():
            args.samples = json.loads(samples.read_text())
        else:
            args.samples = json.loads(args.samples)

    if args.tasks is None:
        eval_logger.error("Need to specify task to evaluate.")
        # This is an error path, so report failure to the calling shell.
        sys.exit(1)
    elif args.tasks == "list":
        print(task_manager.list_all_tasks())
        sys.exit()
    elif args.tasks == "list_groups":
        print(task_manager.list_all_tasks(list_subtasks=False, list_tags=False))
        sys.exit()
    elif args.tasks == "list_tags":
        print(task_manager.list_all_tasks(list_groups=False, list_subtasks=False))
        sys.exit()
    elif args.tasks == "list_subtasks":
        print(task_manager.list_all_tasks(list_groups=False, list_tags=False))
        sys.exit()
    else:
        if os.path.isdir(args.tasks):
            # A directory: load every YAML config directly inside it.
            import glob

            yaml_path = os.path.join(args.tasks, "*.yaml")
            task_names = [
                utils.load_yaml_config(yaml_file)
                for yaml_file in glob.glob(yaml_path)
            ]
        else:
            task_list = args.tasks.split(",")
            task_names = task_manager.match_tasks(task_list)
            # Names that did not match a registered task may be YAML paths.
            for task in [task for task in task_list if task not in task_names]:
                if os.path.isfile(task):
                    config = utils.load_yaml_config(task)
                    task_names.append(config)
            task_missing = [
                task for task in task_list if task not in task_names and "*" not in task
            ]  # we don't want errors if a wildcard ("*") task name was used

            if task_missing:
                missing = ", ".join(task_missing)
                eval_logger.error(
                    f"Tasks were not found: {missing}\n"
                    f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
                )
                raise ValueError(
                    f"Tasks not found: {missing}. Try `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above, or pass '--verbosity DEBUG' to troubleshoot task registration issues."
                )

    # Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args
    if args.trust_remote_code:
        eval_logger.info(
            "Passed `--trust_remote_code`, setting environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true`"
        )
        # HACK: import datasets and override its HF_DATASETS_TRUST_REMOTE_CODE value internally,
        # because it's already been determined based on the prior env var before launching our
        # script--`datasets` gets imported by lm_eval internally before these lines can update the env.
        import datasets

        datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True

        # --model_args may have been given as JSON, in which case it is a
        # dict and string concatenation would raise TypeError.
        if isinstance(args.model_args, dict):
            args.model_args["trust_remote_code"] = True
        else:
            args.model_args = args.model_args + ",trust_remote_code=True"

    # If the logger's effective level is below INFO, print to stdout instead
    # of logging the task selection.
    if eval_logger.getEffectiveLevel() >= logging.INFO:
        eval_logger.info(f"Selected Tasks: {task_names}")
    else:
        print(f"Selected Tasks: {task_names}")

    request_caching_args = request_caching_arg_to_dict(
        cache_requests=args.cache_requests
    )

    results = evaluator.simple_evaluate(
        model=args.model,
        model_args=args.model_args,
        tasks=task_names,
        num_fewshot=args.num_fewshot,
        batch_size=args.batch_size,
        max_batch_size=args.max_batch_size,
        device=args.device,
        use_cache=args.use_cache,
        limit=args.limit,
        samples=args.samples,
        check_integrity=args.check_integrity,
        write_out=args.write_out,
        log_samples=args.log_samples,
        evaluation_tracker=evaluation_tracker,
        system_instruction=args.system_instruction,
        apply_chat_template=args.apply_chat_template,
        fewshot_as_multiturn=args.fewshot_as_multiturn,
        gen_kwargs=args.gen_kwargs,
        task_manager=task_manager,
        predict_only=args.predict_only,
        random_seed=args.seed[0],
        numpy_random_seed=args.seed[1],
        torch_random_seed=args.seed[2],
        fewshot_random_seed=args.seed[3],
        confirm_run_unsafe_code=args.confirm_run_unsafe_code,
        metadata=metadata,
        **request_caching_args,
    )

    if results is not None:
        if args.log_samples:
            # Samples are removed from `results` and handled separately.
            samples = results.pop("samples")
        dumped = json.dumps(
            results, indent=2, default=handle_non_serializable, ensure_ascii=False
        )
        if args.show_config:
            print(dumped)

        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))

        # Add W&B logging
        if args.wandb_args:
            try:
                wandb_logger.post_init(results)
                wandb_logger.log_eval_result()
                if args.log_samples:
                    wandb_logger.log_eval_samples(samples)
            except Exception as e:
                # Best effort: a W&B failure must not abort saving results.
                eval_logger.info(f"Logging to Weights and Biases failed due to {e}")

        evaluation_tracker.save_results_aggregated(
            results=results, samples=samples if args.log_samples else None
        )

        if args.log_samples:
            for task_name in results["configs"]:
                evaluation_tracker.save_results_samples(
                    task_name=task_name, samples=samples[task_name]
                )

        if (
            evaluation_tracker.push_results_to_hub
            or evaluation_tracker.push_samples_to_hub
        ):
            evaluation_tracker.recreate_metadata_card()

        print(
            f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
            f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
        )
        print(make_table(results))
        if "groups" in results:
            print(make_table(results, "groups"))

        if args.wandb_args:
            # Tear down wandb run once all the logging is done.
            wandb_logger.run.finish()
- They take all model outputs (`instance.resps` for all `task.instances`) - across all instances of a task, and perform operations. - In a single run, one can configure any number of separate filters or lists of filters. - - """ - - def __init__(self, **kwargs) -> None: - """ - Can define custom behavior here, if an individual instantiation of a Filter class should have state. - """ - - @abstractmethod - def apply(self, resps: Union[List, Iterable], docs: List[dict]) -> Iterable: - """ - Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects. - Should return the list of (filtered) response lists *in the same order as they were input*, e.g. - if pass in [, ] should return - [, ] - """ - return resps - - -@dataclass -class FilterEnsemble: - """ - FilterEnsemble creates a pipeline applying multiple filters. - Its intended usage is to stack multiple post-processing steps in order. - `task.apply_filters` should use a list of FilterEnsemble classes that it stores, to apply each - pipeline separately. - """ - - name: str - filters: List[Callable[[], Filter]] - - def apply(self, instances: List[Instance]) -> None: - resps, docs = zip(*((inst.resps, inst.doc) for inst in instances)) - resps, docs = list(resps), list(docs) - - for f in self.filters: - # apply filters in sequence - resps = f().apply(resps, docs) - - # add the end results after filtering to filtered_requests of their respective source instances. - # has key `self.name`: each FilterEnsemble applied in a given run should use a different name. 
- for inst, resp in zip(instances, resps): - inst.filtered_resps[self.name] = resp diff --git a/lm-evaluation-harness/lm_eval/api/group.py b/lm-evaluation-harness/lm_eval/api/group.py deleted file mode 100644 index 0c60739bbd26c79ecab91f54240798b2ae9e3313..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/api/group.py +++ /dev/null @@ -1,115 +0,0 @@ -import abc -from dataclasses import asdict, dataclass -from inspect import getsource -from typing import Any, Callable, List, Optional, Union - - -@dataclass -class AggMetricConfig(dict): - metric: Optional[str] = None - aggregation: Optional[str] = "mean" - weight_by_size: Optional[str] = False - # list of filter names which should be incorporated into the aggregated metric. - filter_list: Optional[Union[str, list]] = "none" - - def __post_init__(self): - if self.aggregation != "mean" and not callable(self.aggregation): - raise ValueError( - f"Currently, 'mean' is the only pre-defined aggregation across groups' subtasks. Got '{self.aggregation}'." - ) - - if isinstance(self.filter_list, str): - self.filter_list = [self.filter_list] - - -@dataclass -class GroupConfig(dict): - group: Optional[str] = None - group_alias: Optional[str] = None - task: Optional[Union[str, list]] = None - aggregate_metric_list: Optional[ - Union[List[AggMetricConfig], AggMetricConfig, dict] - ] = None - metadata: Optional[dict] = ( - None # by default, not used in the code. 
allows for users to pass arbitrary info to tasks - ) - - def __getitem__(self, item): - return getattr(self, item) - - def __setitem__(self, item, value): - return setattr(self, item, value) - - def __post_init__(self): - if self.aggregate_metric_list is not None: - if isinstance(self.aggregate_metric_list, dict): - self.aggregate_metric_list = [self.aggregate_metric_list] - - self.aggregate_metric_list = [ - AggMetricConfig(**item) if isinstance(item, dict) else item - for item in self.aggregate_metric_list - ] - - def to_dict(self, keep_callable: bool = False) -> dict: - """dumps the current config as a dictionary object, as a printable format. - null fields will not be printed. - Used for dumping results alongside full task configuration - - :return: dict - A printable dictionary version of the TaskConfig object. - - # TODO: should any default value in the TaskConfig not be printed? - """ - cfg_dict = asdict(self) - # remove values that are `None` - for k, v in list(cfg_dict.items()): - if callable(v): - cfg_dict[k] = self.serialize_function(v, keep_callable=keep_callable) - return cfg_dict - - def serialize_function( - self, value: Union[Callable, str], keep_callable=False - ) -> Union[Callable, str]: - """Serializes a given function or string. - - If 'keep_callable' is True, the original callable is returned. - Otherwise, attempts to return the source code of the callable using 'getsource'. 
- """ - if keep_callable: - return value - else: - try: - return getsource(value) - except (TypeError, OSError): - return str(value) - - -class ConfigurableGroup(abc.ABC): - def __init__( - self, - config: Optional[dict] = None, - ) -> None: - self._config = GroupConfig(**config) - - @property - def group(self): - return self._config.group - - @property - def group_alias(self): - return self._config.group_alias - - @property - def version(self): - return self._config.version - - @property - def config(self): - return self._config.to_dict() - - @property - def group_name(self) -> Any: - return self._config.group - - def __repr__(self): - return f"ConfigurableGroup(group={self.group},group_alias={self.group_alias})" diff --git a/lm-evaluation-harness/lm_eval/api/instance.py b/lm-evaluation-harness/lm_eval/api/instance.py deleted file mode 100644 index d3c6afa0644e729ba441728c72a2469fdad07b8f..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/api/instance.py +++ /dev/null @@ -1,38 +0,0 @@ -from dataclasses import dataclass, field -from typing import Literal, Optional, Tuple - - -OutputType = Literal[ - "loglikelihood", "loglikelihood_rolling", "generate_until", "multiple_choice" -] - - -@dataclass -class Instance: - request_type: OutputType - doc: dict - arguments: tuple - idx: int - metadata: Tuple[Optional[str], Optional[int], Optional[int]] = field( - default_factory=lambda: (None, None, None) - ) - resps: list = field(default_factory=list) - filtered_resps: dict = field(default_factory=dict) - - # initialized after init - task_name: Optional[str] = None - doc_id: Optional[int] = None - repeats: Optional[int] = None - - def __post_init__(self) -> None: - # unpack metadata field - self.task_name, self.doc_id, self.repeats = self.metadata - - @property - def args(self): - """ - Returns (string,) where `string` is the string to calculate loglikelihood over - """ - return ( - self.arguments if isinstance(self.arguments, tuple) else 
(self.arguments,) - ) diff --git a/lm-evaluation-harness/lm_eval/api/metrics.py b/lm-evaluation-harness/lm_eval/api/metrics.py deleted file mode 100644 index 61fca5e19d376502f3b75aa5328045cee6ee5454..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/api/metrics.py +++ /dev/null @@ -1,578 +0,0 @@ -import logging -import math -import random -import re -import string -from collections.abc import Iterable -from typing import List - -import numpy as np -import sacrebleu - -from lm_eval.api.registry import register_aggregation, register_metric - - -eval_logger = logging.getLogger(__name__) - - -# Register Aggregations First -@register_aggregation("bypass") -def bypass_agg(arr): - return 999 - - -@register_aggregation("nanmean") -def nanmean(arr): - if len(arr) == 0 or all(np.isnan(arr)): - return np.nan - return np.nanmean(arr) - - -@register_aggregation("mean") -def mean(arr): - return sum(arr) / len(arr) - - -@register_aggregation("median") -def median(arr): - return arr[len(arr) // 2] - - -# Certain metrics must be calculated across all documents in a benchmark. -# We use them as aggregation metrics, paired with no-op passthrough metric fns. 
-@register_aggregation("perplexity") -def perplexity(items): - return math.exp(-mean(items)) - - -@register_aggregation("weighted_perplexity") -def weighted_perplexity(items): - return math.exp(-weighted_mean(items)) - - -@register_aggregation("bits_per_byte") -def bits_per_byte(items): - return -weighted_mean(items) / math.log(2) - - -@register_aggregation("f1") -def f1_score(items): - from sklearn.metrics import f1_score - - unzipped_list = list(zip(*items)) - golds = unzipped_list[0] - preds = unzipped_list[1] - fscore = f1_score(golds, preds) - - return np.max(fscore) - - -@register_aggregation("matthews_corrcoef") -def matthews_corrcoef(items): - from sklearn.metrics import matthews_corrcoef - - unzipped_list = list(zip(*items)) - golds = unzipped_list[0] - preds = unzipped_list[1] - return matthews_corrcoef(golds, preds) - - -@register_aggregation("bleu") -def bleu(items): - """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric - for evaluating a generated sentence to a reference sentence. It counts matching - n-grams in the candidate translation to n-grams in the reference text, where - 1-gram or unigram would be each token and a bigram comparison would be each - word pair. The comparison is made regardless of word order - Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/ - Paper: https://www.aclweb.org/anthology/P02-1040/ - - Higher is better - """ - refs = list(zip(*items))[0] - preds = list(zip(*items))[1] - refs, preds = _sacreformat(refs, preds) - return sacrebleu.corpus_bleu(preds, refs).score - - -@register_aggregation("chrf") -def chrf(items): - """chrF++ is a tool for automatic evaluation of machine translation output - based on character n-gram precision and recall enhanced with word n-grams. 
- Source: https://github.com/m-popovic/chrF - Paper: https://www.aclweb.org/anthology/W15-3049.pdf - - Higher is better # TODO I think - """ - refs = list(zip(*items))[0] - preds = list(zip(*items))[1] - refs, preds = _sacreformat(refs, preds) - return sacrebleu.corpus_chrf(preds, refs).score - - -@register_aggregation("ter") -def ter(items): - """Translation Error Rate is an error metric for machine translation that - measures the number of edits required to change a system output into one - of the references - Source: http://www.cs.umd.edu/~snover/tercom/ - Paper: http://mt-archive.info/AMTA-2006-Snover.pdf - - Lower is better - """ - refs = list(zip(*items))[0] - preds = list(zip(*items))[1] - refs, preds = _sacreformat(refs, preds) - return sacrebleu.corpus_ter(preds, refs).score - - -@register_aggregation("brier_score") -def brier_score(items): # This is a passthrough function - gold, predictions = list(zip(*items)) - bs, num_class = np.array(predictions).shape - - gold = list(gold) - gold_one_hot = np.eye(num_class)[gold] - return np.mean(np.sum((predictions - gold_one_hot) ** 2, axis=1)) - - -@register_metric( - metric="brier_score", - higher_is_better=False, - output_type=["multiple_choice"], - aggregation="brier_score", -) -def brier_score_fn(items): # This is a passthrough function - return items - - -@register_metric( - metric="acc", - higher_is_better=True, - output_type=["loglikelihood", "multiple_choice"], - aggregation="mean", -) -def acc_fn(items): # This is a passthrough function - return items - - -@register_metric( - metric="acc_norm", - higher_is_better=True, - output_type=["loglikelihood", "multiple_choice"], - aggregation="mean", -) -def acc_norm_fn(items): # This is a passthrough function - return items - - -@register_metric( - metric="acc_mutual_info", - higher_is_better=True, - output_type="multiple_choice", - aggregation="mean", -) -def acc_mutual_info_fn(items): # This is a passthrough function - return items - - -### the code used in the 
`exact_match_hf_evaluate` function is ported from -### https://github.com/huggingface/evaluate/blob/main/metrics/exact_match/exact_match.py -### which is under the apache license. - -# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -def exact_match_hf_evaluate( - predictions, - references, - regexes_to_ignore=None, - ignore_case=False, - ignore_punctuation=False, - ignore_numbers=False, -): - if regexes_to_ignore is not None: - for s in regexes_to_ignore: - predictions = np.array([re.sub(s, "", x) for x in predictions]) - references = np.array([re.sub(s, "", x) for x in references]) - else: - predictions = np.asarray(predictions) - references = np.asarray(references) - - if ignore_case: - predictions = np.char.lower(predictions) - references = np.char.lower(references) - - if ignore_punctuation: - repl_table = string.punctuation.maketrans("", "", string.punctuation) - predictions = np.char.translate(predictions, table=repl_table) - references = np.char.translate(references, table=repl_table) - - if ignore_numbers: - repl_table = string.digits.maketrans("", "", string.digits) - predictions = np.char.translate(predictions, table=repl_table) - references = np.char.translate(references, table=repl_table) - - score_list = predictions == references - - return {"exact_match": np.mean(score_list)} - - -### - - -@register_metric( - metric="exact_match", - higher_is_better=True, - 
output_type="generate_until", - aggregation="mean", -) -def exact_match_fn(**kwargs): - return exact_match_hf_evaluate(**kwargs) - - -@register_metric( - metric="perplexity", - higher_is_better=False, - output_type="loglikelihood", - aggregation="perplexity", -) -def perplexity_fn(items): # This is a passthrough function - return items - - -@register_metric( - metric="word_perplexity", - higher_is_better=False, - output_type="loglikelihood_rolling", - aggregation="weighted_perplexity", -) -def word_perplexity_fn(items): # This is a passthrough function - return items - - -@register_metric( - metric="byte_perplexity", - higher_is_better=False, - output_type="loglikelihood_rolling", - aggregation="weighted_perplexity", -) -def byte_perplexity_fn(items): # This is a passthrough function - return items - - -@register_metric( - metric="bits_per_byte", - higher_is_better=False, - output_type="loglikelihood_rolling", - aggregation="bits_per_byte", -) -def bits_per_byte_fn(items): # This is a passthrough function - return items - - -def pop_stddev(arr): - mu = mean(arr) - return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr)) - - -def sample_stddev(arr): - mu = mean(arr) - return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1)) - - -def mean_stderr(arr): - return sample_stddev(arr) / math.sqrt(len(arr)) - - -@register_metric( - metric="bypass", - higher_is_better=True, - output_type=["loglikelihood", "multiple_choice", "generate_until"], - aggregation="bypass", -) -def bypass(items): - return None - - -@register_metric( - metric="mcc", - higher_is_better=True, - output_type="multiple_choice", - aggregation="matthews_corrcoef", -) -def mcc_fn(items): # This is a passthrough function - return items - - -@register_metric( - metric="f1", - higher_is_better=True, - output_type="multiple_choice", - aggregation="f1", -) -def f1_fn(items): # This is a passthrough function - return items - - -@register_metric( - metric="bleu", - higher_is_better=True, - 
output_type="generate_until", - aggregation="bleu", -) -def bleu_fn(items): # This is a passthrough function - return items - - -@register_metric( - metric="chrf", - higher_is_better=True, - output_type="generate_until", - aggregation="chrf", -) -def chrf_fn(items): # This is a passthrough function - return items - - -@register_metric( - metric="ter", - higher_is_better=True, - output_type="generate_until", - aggregation="ter", -) -def ter_fn(items): # This is a passthrough function - return items - - -@register_metric( - metric="acc_all", - higher_is_better=True, - output_type="loglikelihood", - aggregation="mean", -) -def acc_all(items): - # Only count as correct if all answers are labeled correctly for each question - question_scoring_dict = {} - preds = list(zip(*items))[0] - docs = list(zip(*items))[1] - - for doc, pred in zip(docs, preds): - paragraph_id = doc["idx"]["paragraph"] - question_id = doc["idx"]["question"] - if (paragraph_id, question_id) not in question_scoring_dict: - question_scoring_dict[(paragraph_id, question_id)] = [] - - gold_label = doc["label"] == 1 - - question_scoring_dict[(paragraph_id, question_id)].append(gold_label == pred) - acc = np.mean([int(all(x)) for x in question_scoring_dict.values()]) - return acc - - -def acc_all_stderr(items): - # Only count as correct if all answers are labeled correctly for each question - question_scoring_dict = {} - preds = list(zip(*items))[0] - docs = list(zip(*items))[1] - - for doc, pred in zip(docs, preds): - question_id = doc["idx"]["question"] - if question_id not in question_scoring_dict: - question_scoring_dict[question_id] = [] - - gold_label = doc["label"] == 1 - question_scoring_dict[question_id].append(gold_label == pred) - - acc = mean_stderr([int(all(x)) for x in question_scoring_dict.values()]) - return acc - - -def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): - """Compute max metric between prediction and each ground truth.""" - scores_for_ground_truths = [] - 
for ground_truth in ground_truths: - score = metric_fn(prediction, ground_truth) - scores_for_ground_truths.append(score) - return max(scores_for_ground_truths) - - -def weighted_mean(items): - a, b = zip(*items) - return sum(a) / sum(b) - - -def is_non_str_iterable(obj): - return isinstance(obj, Iterable) and not isinstance(obj, str) - - -def _sacreformat(refs, preds): - """Format refs and preds for sacrebleu corpus calculation. It is very particular""" - # Sacrebleu expects (List[str], List[List[str]) - # e.g. sacrebleu.corpus_bleu([pred_t], [[ref1_stream], [ref2_stream], ...]) - - # Note [ref1_stream] is the first reference for each pred. - # So lists are size N and (M, N) for N preds and M possible refs for each pred - # This is a different order of dimensions that I would expect - - # We expect refs to be List[str] or List[List[str]], the outer list corresponding to preds - # Must become List[List[str]] with the inner list corresponding to preds - if not is_non_str_iterable(refs): - refs = list(refs) - if not is_non_str_iterable(refs[0]): - refs = [[ref] for ref in refs] - refs = list(zip(*refs)) - # Note the number of refs in each ref list much match the number of preds - - # We expect preds to be List[str] or List[List[str]]. 
Must become List[str] - if not is_non_str_iterable(preds): - preds = list(preds) - if is_non_str_iterable(preds[0]): - assert len(preds[0]) == 1, f"Pred must be a str, was {preds[0]}" - preds = [pred[0] for pred in preds] - - return refs, preds - - -# stderr stuff - - -class _bootstrap_internal: - def __init__(self, f, n) -> None: - self.f = f - self.n = n - - def __call__(self, v): - i, xs = v - rnd = random.Random() - rnd.seed(i) - res = [] - for _ in range(self.n): - res.append(self.f(rnd.choices(xs, k=len(xs)))) - return res - - -def bootstrap_stderr(f, xs, iters): - import multiprocessing as mp - - pool = mp.Pool(mp.cpu_count()) - # this gives a biased estimate of the stderr (i.e w/ the mean, it gives something - # equivalent to stderr calculated without Bessel's correction in the stddev. - # Unfortunately, I haven't been able to figure out what the right correction is - # to make the bootstrap unbiased - i considered multiplying by sqrt(n/(n-1)) but - # that would be ad-hoc and I can't prove that that would actually be an unbiased estimator) - # Thankfully, shouldn't matter because our samples are pretty big usually anyways - res = [] - chunk_size = min(1000, iters) - from tqdm import tqdm - - print("bootstrapping for stddev:", f.__name__) - for bootstrap in tqdm( - pool.imap( - _bootstrap_internal(f, chunk_size), - [(i, xs) for i in range(iters // chunk_size)], - ), - total=iters // chunk_size, - ): - # sample w replacement - res.extend(bootstrap) - - pool.close() - return sample_stddev(res) - - -def stderr_for_metric(metric, bootstrap_iters: int): - if bootstrap_iters <= 0: - # return no function (don't compute stderr) if bootstrap iters = 0 - return None - - bootstrappable = [ - median, - matthews_corrcoef, - f1_score, - perplexity, - bleu, - chrf, - ter, - nanmean, - ] - - if metric in bootstrappable: - return lambda x: bootstrap_stderr(metric, x, iters=bootstrap_iters) - - stderr = {mean: mean_stderr, acc_all: acc_all_stderr} - - return 
stderr.get(metric, None) - - -def pooled_sample_stderr(stderrs: List[float], sizes: List[int]): - # Used to aggregate bootstrapped stderrs across subtasks in a group, - # when we are weighting by the size of each subtask. - # - - assert len(stderrs) == len(sizes) - - # formula source: https://en.wikipedia.org/wiki/Pooled_variance - # and: https://stats.stackexchange.com/a/4841331 - # this empirically seems to match running `stderr_for_metric` on all instances - # from the subtasks concatenated with each other. - pooled_sample_var = ( - sum([(size - 1) * stderr**2 * size for size, stderr in zip(sizes, stderrs)]) - ) / (sum(sizes) - len(sizes)) - - return np.sqrt(pooled_sample_var / sum(sizes)) - - -def combined_sample_stderr(stderrs: List[float], sizes: List[int], metrics=None): - assert metrics is not None, ( - "Need to pass a list of each subtask's metric for this stderr aggregation" - ) - assert len(stderrs) == len(sizes) and len(sizes) == len(metrics) - - # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1390 for more documentation. - # This formula depends on sample means. - # removed because it seems to give erroneously huge stderrs for groupings of tasks - # and does not seem to match up with bootstrap-calculated stderrs for groups. 
- - ### don't use this unless a statistician has told you it's the right thing to do ### - - # accumulators: we'll aggregate pairwise N - 1 times - variance = stderrs[0] ** 2 - curr_size = sizes[0] - curr_score = metrics[0] - - for stderr, size, score in zip(stderrs[1:], sizes[1:], metrics[1:]): - curr_score = ((curr_score * curr_size) + (score * size)) / ( - curr_size + size - ) # NOTE: this assumes our aggregation fn is "mean" - - variance = ((curr_size - 1) * variance + (size - 1) * (stderr**2)) / ( - curr_size + size - 1 - ) + curr_size * size / ((curr_size + size) * (curr_size + size - 1)) * ( - curr_score - score - ) ** 2 - - return np.sqrt(variance) - - -def aggregate_subtask_metrics(metrics, sizes, weight_by_size=True): - # A helper function that is used to aggregate - # subtask scores cross-task. - # TODO: does not hold for non-mean aggregations - if not weight_by_size: - sizes = [1] * len(sizes) - - assert len(metrics) == len(sizes) - - return sum([metric * size for metric, size in zip(metrics, sizes)]) / sum(sizes) diff --git a/lm-evaluation-harness/lm_eval/api/model.py b/lm-evaluation-harness/lm_eval/api/model.py deleted file mode 100644 index 1debc9b4da2ed307951e50bf2351af971937e59a..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/api/model.py +++ /dev/null @@ -1,493 +0,0 @@ -import abc -import hashlib -import json -import logging -import os -from typing import Dict, List, Optional, Tuple, Type, TypeVar, Union - -import transformers -from sqlitedict import SqliteDict -from tqdm import tqdm - -from lm_eval import utils - - -eval_logger = logging.getLogger(__name__) - -T = TypeVar("T", bound="LM") - - -class LM(abc.ABC): - def __init__(self) -> None: - """Defines the interface that should be implemented by all LM subclasses. - LMs are assumed to take text (strings) as input and yield strings as output - (inputs/outputs should be tokenization-agnostic.) - - """ - # set rank and world size to a single process, by default. 
- self._rank = 0 - self._world_size = 1 - self.cache_hook = CacheHook(None) - - @abc.abstractmethod - def loglikelihood(self, requests) -> List[Tuple[float, bool]]: - """Compute log-likelihood of generating a continuation from a context. - Downstream tasks should attempt to use loglikelihood instead of other - LM calls whenever possible. - - :param requests: list[Instance] - A list of Instance objects, with property `args` which returns a tuple (context, continuation). - `context: str` - Context string. Implementations of LM must be able to handle an - empty context string. - `continuation: str` - The continuation over which log likelihood will be calculated. If - there is a word boundary, the space should be in the continuation. - For example, context="hello" continuation=" world" is correct. - - :return: list[tuple[float, bool]] - A list of pairs (logprob, isgreedy) - `logprob: float` - The log probability of `continuation`. - `isgreedy`: - Whether `continuation` would be generated by greedy sampling from `context`. - """ - pass - - @abc.abstractmethod - def loglikelihood_rolling(self, requests) -> List[float]: - """Compute full log-likelihood of a string, with no truncation, for perplexity computation - - We will use the full max context length of the model. - - For inputs that exceed the max context length, we divide the tokenized string into chunks of up to - the max context length. - - IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementations - which may simply concatenate multiple documents together. - - IMPORTANT: We maximize the amount of context for each prediction. Specifically, for inputs that we break into - multiple chunks, the last input will still a full-sized context. 
- Example: - Input tokens: [ 0 1 2 3 4 5 6 7 8 9 ] - Prefix: BOS/EOS - Max context length: 4 - Resulting input/prediction pairs: - - INPUT: BOS 0 1 2 - PRED: 0 1 2 3 - - INPUT: 3 4 5 6 - PRED: 4 5 6 7 - - INPUT: 5 6 7 8 - PRED: 8 9 - - Observe that: - 1. Each token is predicted exactly once - 2. For the last pair, we provide the full context, but only score the last two tokens - - :param requests: list[Instance] - A list of Instance objects with property `args` which returns a tuple (context,). - string: str - String for which we are computing overall loglikelihood - :return: list[tuple[float]] - A list of tuples (logprob,) - logprob: float - The log probability of `context` conditioned on the BOS/EOS token. - Can also be overridden for custom cases by `prefix_token_id`. - """ - pass - - # TODO: Add an optional max length - @abc.abstractmethod - def generate_until(self, requests) -> List[str]: - """Generate greedily until a stopping sequence - - :param requests: list[Instance] - A list of Instance objects with property `args` which returns a tuple (context, gen_kwargs). - context: str - Context string - gen_kwargs: dict - A dictionary of keyword arguments to pass to the generation function e.g. top_k, until, etc. - :return: list[str] - A list of model generated continuations. - continuation: str - The generated continuation. - """ - pass - - def apply_chat_template( - self, chat_history: List[Dict[str, str]], add_generation_prompt=True - ) -> str: - """ - Defines how to transform few-shot examples provided as chat history into a format that can be used as input to the LM. - - :param chat_history: list[dict[str, str]] - A list of dictionaries with keys 'role' and 'content'. - Values are strings representing the role name and the content of the message, respectively. - :param add_generation_prompt: bool - Whether to append an assistant gen prefix (for e.g. <|assistant|>) to the assistant messages in the chat history. False if prefilling an assistant message. 
- :return: str - A string representing the chat history in a format that can be used as input to the LM. - """ - raise NotImplementedError( - "To use this model with chat templates, please implement the 'apply_chat_template' method for your model type." - ) - - @classmethod - def create_from_arg_string( - cls: Type[T], arg_string: str, additional_config: Optional[dict] = None - ) -> T: - """ - Creates an instance of the LM class using the given argument string and additional config. - - Parameters: - - arg_string: A string containing arguments in the format key1=value1,key2=value2. - - additional_config: Optional dictionary containing additional configuration parameters. - - Returns: - - Instance of the LM class. - """ - additional_config = {} if additional_config is None else additional_config - args = utils.simple_parse_args_string(arg_string) - args2 = {k: v for k, v in additional_config.items() if v is not None} - return cls(**args, **args2) - - @classmethod - def create_from_arg_obj( - cls: Type[T], arg_dict: dict, additional_config: Optional[dict] = None - ) -> T: - """ - Creates an instance of the LM class using the given arg_obj - - Parameters: - - arg_obj: A dict containing arguments in the format key1=value1,key2=value2. - - additional_config: Optional dictionary containing additional configuration parameters. - - Returns: - - Instance of the LM class. - """ - - additional_config = {} if additional_config is None else additional_config - additional_config = { - k: v for k, v in additional_config.items() if v is not None - } - - return cls(**arg_dict, **additional_config) - - @property - def rank(self): - # used in the case of parallelism. Hardcoded to - # ensure no errors arise using API models which do - # not support multi-device parallelism nor expect it. - return self._rank - - @property - def world_size(self): - # used in the case of parallelism. 
Hardcoded to - # ensure no errors arise using API models which do - # not support multi-device parallelism nor expect it. - return self._world_size - - @property - def tokenizer_name(self) -> str: - """Must be defined for LM subclasses which implement Chat Templating. - Should return the name of the tokenizer or chat template used. - Used only to properly fingerprint caches when requests are being cached with `--cache_requests`, otherwise not used. - """ - raise NotImplementedError( - "To use this model with chat templates, please implement the 'tokenizer_name' property." - ) - - def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]: - """Returns the chat template structure for user/assistant messages if a template is provided. - This method is intended to be overridden in a subclass to define a specific chat template format. - For models that do not support chat templates, this method returns None by default. - """ - - return "" - - def set_cache_hook(self, cache_hook) -> None: - self.cache_hook = cache_hook - - -### SQLite-based caching of LM responses -def hash_args(attr, args): - dat = json.dumps([attr] + list(args)) - return hashlib.sha256(dat.encode("utf-8")).hexdigest() - - -class CacheHook: - def __init__(self, cachinglm) -> None: - if cachinglm is None: - self.dbdict = None - return - - self.dbdict = cachinglm.dbdict - - def add_partial(self, attr, req, res) -> None: - if self.dbdict is None: - return - hsh = hash_args(attr, req) - self.dbdict[hsh] = res - - -class CachingLM: - def __init__(self, lm, cache_db) -> None: - """LM wrapper that returns cached results if they exist, and uses the underlying LM if not. 
- - :param lm: LM - Underlying LM - :param cache_db: str - Path to cache db - """ - self.lm = lm - self.cache_db = cache_db - if os.path.dirname(cache_db): - os.makedirs(os.path.dirname(cache_db), exist_ok=True) - self.dbdict = SqliteDict(cache_db, autocommit=True) - - # add hook to lm - lm.set_cache_hook(self.get_cache_hook()) - - def __getattr__(self, attr: str): - lm_attr = getattr(self.lm, attr) - if attr not in ["loglikelihood", "loglikelihood_rolling", "generate_until"]: - eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM") - return lm_attr - - def fn(requests): - res = [] - remaining_reqs = [] - warned = False - # figure out which ones are cached and which ones are new - eval_logger.info( - f"Loading '{attr}' responses from cache '{self.cache_db}' where possible..." - ) - for req in tqdm(requests, desc="Checking cached requests"): - hsh = hash_args(attr, req.args) - if attr == "generate_until" and req.args[1].get("do_sample", False): - # when we are doing non-greedy generation, don't use the cache - # (else every "randomly sampled" generation would be identical for repeats > 1). - if not warned: - eval_logger.warning( - f"Arguments to lm.generate_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests." 
- ) - warned = True - res.append(None) - remaining_reqs.append(req) - elif hsh in self.dbdict: - ob = self.dbdict[hsh] - - assert ob is not None - - res.append(ob) - else: - res.append(None) - remaining_reqs.append(req) - eval_logger.info( - f"Cached requests: {len(requests) - len(remaining_reqs)}, Requests remaining: {len(remaining_reqs)}" - ) - if remaining_reqs: - # actually run the LM on the requests that do not have cached results - rem_res = getattr(self.lm, attr)(remaining_reqs) - else: - rem_res = [] - - # stick the new ones back into the list and also cache any of the new ones - resptr = 0 - for req, r in zip(remaining_reqs, rem_res): - while res[resptr] is not None: - resptr += 1 - - res[resptr] = r - - # caching - hsh = hash_args(attr, req.args) - self.dbdict[hsh] = r - self.dbdict.commit() - - return res - - return fn - - def get_cache_hook(self): - return CacheHook(self) - - -class TemplateLM(LM): - """ - A class acting as intermediary between the LM base class - and boilerplate often included in other LM subclasses. - """ - - tokenizer = None - - @property - @abc.abstractmethod - def eot_token_id(self): - pass - - @property - def prefix_token_id(self): - # it is used as prefix for loglikelihood - return self.eot_token_id - - @abc.abstractmethod - def tok_encode(self, string: str, **kwargs) -> List[int]: - """ - Tokenize a string using the model's tokenizer and return a list of token IDs. 
- """ - pass - - @abc.abstractmethod - def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]: - pass - - def _encode_pair( - self, context: str, continuation: str - ) -> Tuple[List[int], List[int]]: - n_spaces = len(context) - len(context.rstrip()) - if n_spaces > 0: - continuation = context[-n_spaces:] + continuation - context = context[:-n_spaces] - - model_class = getattr(self, "AUTO_MODEL_CLASS", None) - - if model_class == transformers.AutoModelForSeq2SeqLM: - context_enc = self.tok_encode(context) - continuation_enc = self.tok_encode(continuation, add_special_tokens=False) - else: - whole_enc = self.tok_encode(context + continuation) - context_enc = self.tok_encode(context) - - context_enc_len = len(context_enc) - continuation_enc = whole_enc[context_enc_len:] - - return context_enc, continuation_enc - - def loglikelihood( - self, requests, disable_tqdm: bool = False - ) -> List[Tuple[float, bool]]: - new_reqs = [] - for context, continuation in [req.args for req in requests]: - if context == "": - # BOS or EOS as context - context_enc, continuation_enc = ( - [self.prefix_token_id], - self.tok_encode(continuation), - ) - else: - context_enc, continuation_enc = self._encode_pair(context, continuation) - - new_reqs.append(((context, continuation), context_enc, continuation_enc)) - - return self._loglikelihood_tokens(new_reqs, disable_tqdm=disable_tqdm) - - @abc.abstractmethod - def loglikelihood_rolling( - self, requests, disable_tqdm: bool = False - ) -> List[float]: - pass - - @abc.abstractmethod - def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: - pass - - def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]: - """ - Set and get the appropriate chat template for the model. - This method sets the tokenizer's chat_template and returns the template string for reproducibility. 
- - The template selection logic is adapted from the Transformers library's `apply_chat_template` - method in the Tokenizer class. The original implementation can be found at: - https://github.com/huggingface/transformers/blob/fc35907f95459d7a6c5281dfadd680b6f7b620e3/src/transformers/tokenization_utils_base.py#L1687 - - This method ensures that the right template is chosen based on the following: - 0. If the model has no 'tokenizer' attribute: assumes that there is only a single possible chat template, handled on the model provider side internally. Returns the empty string. - 1. If the model's tokenizer has multiple templates: - a. Use the specified template if it exists in the dictionary. - b. Use the default template from the list if no specific template is provided. - c. Raise an error if no default template exists and no specific template is provided. - 2. If the model's tokenizer has a single template or no template: - a. Use the tokenizer's chat template if available. - b. Fall back to the default chat template if no tokenizer chat template exists. - - Args: - chat_template (Union[bool, str]): Specifies the chat template to use. - - If False or None, no template is applied. - - If True, the default or only available template is used. - - If a string, the template with the matching name is used. - - Returns: - Optional[str]: The selected chat template, or None if no template is applied. - """ - if self.tokenizer is None: - return "" - - if chat_template is False or chat_template is None: - eval_logger.warning( - "model.chat_template was called with the chat_template set to False or None. " - "Therefore no chat template will be applied. Make sure this is an intended behavior." 
- ) - return None - - # Convert boolean chat_template to None to ensure compatibility with the adapted logic - if isinstance(chat_template, bool): - chat_template = None - using_default_template = False - - # First, handle the cases when the model has a dict of multiple templates - try: - template = ( - self.tokenizer.chat_template or self.tokenizer.default_chat_template - ) - except AttributeError: - return None - - if isinstance(template, dict): - using_default_dict = self.tokenizer.chat_template is None - - if chat_template is not None: - if chat_template in template: - selected_template = template[chat_template] - if using_default_dict: - using_default_template = True - else: - raise ValueError( - f"The specified chat template '{chat_template}' is not available. " - f"Available template names are {sorted(template.keys())}." - ) - else: - # If user didn't pass a chat template, use the default template from the dict - if "default" in template: - selected_template = template["default"] - using_default_template = True - else: - raise ValueError( - "This model has multiple chat templates with no default specified! Please either pass a chat " - "template or the name of the template you wish to use to the `chat_template` argument. Available " - f"template names are {sorted(template.keys())}." - ) - - # Cases when the model has a single template or no template - else: - # priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template - if isinstance(chat_template, str): - eval_logger.warning( - "Chat template name provided, but the tokenizer's chat template is not a dictionary. " - "Using the tokenizer's chat template or the default template instead." 
- ) - if self.tokenizer.chat_template is not None: - selected_template = self.tokenizer.chat_template - else: - selected_template = self.tokenizer.default_chat_template - using_default_template = True - - if using_default_template: - eval_logger.warning( - "No chat template is set for this tokenizer, falling back to a default class-level template. This is " - "very error-prone, because models are often trained with templates different from the class default! " - "Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which " - "point any code depending on them will stop working. We recommend setting a valid chat template before " - "then to ensure that this model continues working without issues." - ) - - return selected_template diff --git a/lm-evaluation-harness/lm_eval/api/registry.py b/lm-evaluation-harness/lm_eval/api/registry.py deleted file mode 100644 index 4673b157b1fc1eaed2eb40e7a1ad527ce1fcb595..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/api/registry.py +++ /dev/null @@ -1,196 +0,0 @@ -import logging -from typing import Callable, Dict, Union - -import evaluate as hf_evaluate - -from lm_eval.api.model import LM - - -eval_logger = logging.getLogger(__name__) - -MODEL_REGISTRY = {} - - -def register_model(*names): - # either pass a list or a single alias. - # function receives them as a tuple of strings - - def decorate(cls): - for name in names: - assert issubclass(cls, LM), ( - f"Model '{name}' ({cls.__name__}) must extend LM class" - ) - - assert name not in MODEL_REGISTRY, ( - f"Model named '{name}' conflicts with existing model! Please register with a non-conflicting alias instead." - ) - - MODEL_REGISTRY[name] = cls - return cls - - return decorate - - -def get_model(model_name): - try: - return MODEL_REGISTRY[model_name] - except KeyError: - raise ValueError( - f"Attempted to load model '{model_name}', but no model for this name found! 
Supported model names: {', '.join(MODEL_REGISTRY.keys())}" - ) - - -TASK_REGISTRY = {} -GROUP_REGISTRY = {} -ALL_TASKS = set() -func2task_index = {} - - -def register_task(name): - def decorate(fn): - assert name not in TASK_REGISTRY, ( - f"task named '{name}' conflicts with existing registered task!" - ) - - TASK_REGISTRY[name] = fn - ALL_TASKS.add(name) - func2task_index[fn.__name__] = name - return fn - - return decorate - - -def register_group(name): - def decorate(fn): - func_name = func2task_index[fn.__name__] - if name in GROUP_REGISTRY: - GROUP_REGISTRY[name].append(func_name) - else: - GROUP_REGISTRY[name] = [func_name] - ALL_TASKS.add(name) - return fn - - return decorate - - -OUTPUT_TYPE_REGISTRY = {} -METRIC_REGISTRY = {} -METRIC_AGGREGATION_REGISTRY = {} -AGGREGATION_REGISTRY: Dict[str, Callable[[], Dict[str, Callable]]] = {} -HIGHER_IS_BETTER_REGISTRY = {} -FILTER_REGISTRY = {} - -DEFAULT_METRIC_REGISTRY = { - "loglikelihood": [ - "perplexity", - "acc", - ], - "loglikelihood_rolling": ["word_perplexity", "byte_perplexity", "bits_per_byte"], - "multiple_choice": ["acc", "acc_norm"], - "generate_until": ["exact_match"], -} - - -def register_metric(**args): - # TODO: do we want to enforce a certain interface to registered metrics? - def decorate(fn): - assert "metric" in args - name = args["metric"] - - for key, registry in [ - ("metric", METRIC_REGISTRY), - ("higher_is_better", HIGHER_IS_BETTER_REGISTRY), - ("aggregation", METRIC_AGGREGATION_REGISTRY), - ]: - if key in args: - value = args[key] - assert value not in registry, ( - f"{key} named '{value}' conflicts with existing registered {key}!" 
- ) - - if key == "metric": - registry[name] = fn - elif key == "aggregation": - registry[name] = AGGREGATION_REGISTRY[value] - else: - registry[name] = value - - return fn - - return decorate - - -def get_metric(name: str, hf_evaluate_metric=False) -> Callable: - if not hf_evaluate_metric: - if name in METRIC_REGISTRY: - return METRIC_REGISTRY[name] - else: - eval_logger.warning( - f"Could not find registered metric '{name}' in lm-eval, searching in HF Evaluate library..." - ) - - try: - metric_object = hf_evaluate.load(name) - return metric_object.compute - except Exception: - eval_logger.error( - f"{name} not found in the evaluate library! Please check https://huggingface.co/evaluate-metric", - ) - - -def register_aggregation(name: str): - def decorate(fn): - assert name not in AGGREGATION_REGISTRY, ( - f"aggregation named '{name}' conflicts with existing registered aggregation!" - ) - - AGGREGATION_REGISTRY[name] = fn - return fn - - return decorate - - -def get_aggregation(name: str) -> Callable[[], Dict[str, Callable]]: - try: - return AGGREGATION_REGISTRY[name] - except KeyError: - eval_logger.warning(f"{name} not a registered aggregation metric!") - - -def get_metric_aggregation(name: str) -> Callable[[], Dict[str, Callable]]: - try: - return METRIC_AGGREGATION_REGISTRY[name] - except KeyError: - eval_logger.warning(f"{name} metric is not assigned a default aggregation!") - - -def is_higher_better(metric_name) -> bool: - try: - return HIGHER_IS_BETTER_REGISTRY[metric_name] - except KeyError: - eval_logger.warning( - f"higher_is_better not specified for metric '{metric_name}'!" 
- ) - - -def register_filter(name): - def decorate(cls): - if name in FILTER_REGISTRY: - eval_logger.info( - f"Registering filter `{name}` that is already in Registry {FILTER_REGISTRY}" - ) - FILTER_REGISTRY[name] = cls - return cls - - return decorate - - -def get_filter(filter_name: Union[str, Callable]) -> Callable: - try: - return FILTER_REGISTRY[filter_name] - except KeyError as e: - if callable(filter_name): - return filter_name - else: - eval_logger.warning(f"filter `{filter_name}` is not registered!") - raise e diff --git a/lm-evaluation-harness/lm_eval/api/samplers.py b/lm-evaluation-harness/lm_eval/api/samplers.py deleted file mode 100644 index 5d1791bdb4f8ae06cf4168dcdfa4c6a5a9bbc823..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/api/samplers.py +++ /dev/null @@ -1,232 +0,0 @@ -import logging -import warnings -from functools import partial -from typing import TYPE_CHECKING, Iterable, Optional, Union - -import datasets - - -if TYPE_CHECKING: - from random import Random - - from lm_eval.api.task import ConfigurableTask, Task - -eval_logger = logging.getLogger("lm-eval") - - -class ContextSampler: - def __init__( - self, - docs: list[dict], - task: Union["Task", "ConfigurableTask"], - fewshot_indices: Optional[Iterable] = None, - rnd: Optional["Random"] = None, - ) -> None: - self.rnd = rnd - if not self.rnd: - raise ValueError( - "A `random.Random` generator argument must be provided to `rnd` of FewShotSampler!" 
- ) - - self.task = task - self.config = task._config - - self.target_delimiter = self.config.target_delimiter - self.fewshot_delimiter = self.config.fewshot_delimiter - - if ( - self.config.fewshot_config is not None - and self.config.fewshot_config.get("doc_to_text", None) is not None - ): - self.doc_to_text = partial( - self.task.doc_to_text, - doc_to_text=self.config.fewshot_config.get("doc_to_text", None), - ) - else: - self.doc_to_text = self.task.doc_to_text - - if ( - self.config.fewshot_config is not None - and self.config.fewshot_config.get("doc_to_target", None) is not None - ): - self.doc_to_target = partial( - self.task.doc_to_target, - doc_to_target=self.config.fewshot_config.get("doc_to_target", None), - ) - else: - self.doc_to_target = self.task.doc_to_target - - if ( - self.config.fewshot_config is not None - and self.config.fewshot_config.get("doc_to_choice", None) is not None - ): - self.doc_to_choice = partial( - self.task.doc_to_choice, - doc_to_choice=self.config.fewshot_config.get("doc_to_choice", None), - ) - else: - self.doc_to_choice = self.task.doc_to_choice - - self.docs = docs # HF dataset split, provided by task._fewshot_docs() - if fewshot_indices: # subset few-shot docs from - if not isinstance(self.docs, datasets.Dataset): - raise ValueError( - "Got `fewshot_indices` but fewshot_docs are not a HF dataset. 
Don't use both `fewshot_indices` and a user-defined few-shot sample list simultaneously" - ) - self.docs = self.docs.select(fewshot_indices) - - def get_context(self, doc: dict, num_fewshot: int, gen_prefix: str = None): - # draw an extra fewshot sample if using same split as evaluating on - prefix = gen_prefix + " " if gen_prefix else "" - n_samples = ( - num_fewshot + 1 - if self.config.fewshot_split == self.config.test_split - else num_fewshot - ) - - # draw `n_samples` docs from fewshot_docs - fewshotex = self.sample(n_samples) - - # get rid of the doc that's the one we're evaluating, if it's in the fewshot - # TODO: should we just stop people from using fewshot from same split as evaluating? - selected_docs = [x for x in fewshotex if x != doc][:num_fewshot] - - labeled_examples = "" - for doc in selected_docs: - doc_content = self.doc_to_text(doc) - doc_target = self.doc_to_target(doc) - if self.config.doc_to_choice is None or isinstance(doc_content, str): - labeled_examples += doc_content - else: - labeled_examples += self.doc_to_choice(doc)[doc_content] - - if doc_target != "": - if self.target_delimiter.isspace() and str(doc_target)[0].isspace(): - # TODO: add logger warn once here. - warnings.warn( - "Both target_delimiter and target start with a space. 
This may cause issues.", - Warning, - stacklevel=2, - ) - labeled_examples += self.target_delimiter - labeled_examples += prefix - labeled_examples += ( - str(doc_target[0]) - if isinstance(doc_target, list) - else doc_target - if self.config.doc_to_choice is None or isinstance(doc_target, str) - else str(self.doc_to_choice(doc)[doc_target]) - ) - labeled_examples += self.fewshot_delimiter - - return labeled_examples - - def get_chat_context( - self, - doc: dict, - num_fewshot: int, - fewshot_as_multiturn: bool = False, - gen_prefix: Optional[str] = None, - ): - # TODO: Do we need any other delimiter - prefix = gen_prefix + " " if gen_prefix else "" - chat_history = [] - # draw an extra fewshot sample if using same split as evaluating on - n_samples = ( - num_fewshot + 1 - if self.config.fewshot_split == self.config.test_split - else num_fewshot - ) - # draw `n_samples` docs from fewshot_docs - fewshotex = self.sample(n_samples) - - # get rid of the doc that's the one we're evaluating, if it's in the fewshot - # TODO: should we just stop people from using fewshot from same split as evaluating? 
- selected_docs = [x for x in fewshotex if x != doc][:num_fewshot] - - if fewshot_as_multiturn: - for doc in selected_docs: - doc_content = self.doc_to_text(doc) - doc_target = self.doc_to_target(doc) - chat_history.append( - { - "role": "user", - "content": doc_content - if self.config.doc_to_choice is None - or isinstance(doc_content, str) - else self.doc_to_choice(doc)[doc_content], - } - ) - chat_history.append( - { - "role": "assistant", - "content": prefix + str(doc_target[0]) - if isinstance(doc_target, list) - else prefix + doc_target - if self.config.doc_to_choice is None - or isinstance(doc_target, str) - else prefix + str(self.doc_to_choice(doc)[doc_target]), - } - ) - else: - # get fewshot context as one user turn - chat_history.append( - { - "role": "user", - "content": self.get_context( - doc, num_fewshot, gen_prefix=gen_prefix - ), - } - ) - - return chat_history - - def sample(self, n: int): - """ - Draw `n` samples from our fewshot docs. This method should be overridden by subclasses. - """ - - return self.rnd.sample(self.docs, n) - - -class FirstNSampler(ContextSampler): - def sample(self, n: int) -> None: - """ - Draw the first `n` samples in order from the specified split. - Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU. - """ - assert n <= len(self.docs), ( - f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available." - ) - return self.docs[:n] - - -class BalancedSampler(ContextSampler): - def sample(self, n: int) -> None: - """ - TODO: this should return approximately class-balanced samples from our fewshot examples. - TODO: what order should they be in? maybe random? 
- """ - - pass - - -class ManualSampler(ContextSampler): - def sample(self, n: int) -> None: - """ """ - pass - - -SAMPLER_REGISTRY = { - "default": ContextSampler, - "first_n": FirstNSampler, -} - - -def get_sampler(name: str): - try: - return SAMPLER_REGISTRY[name] - except KeyError: - raise ValueError( - f"Attempted to use contextsampler '{name}', but no sampling strategy for this name found! Supported model names: {', '.join(SAMPLER_REGISTRY.keys())}" - ) diff --git a/lm-evaluation-harness/lm_eval/api/task.py b/lm-evaluation-harness/lm_eval/api/task.py deleted file mode 100644 index ad334b48b0ed80575326a09f04298a296838842d..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/api/task.py +++ /dev/null @@ -1,1879 +0,0 @@ -import abc -import ast -import logging -import random -import re -from collections.abc import Callable -from copy import deepcopy -from dataclasses import asdict, dataclass -from inspect import getsource -from typing import ( - Any, - Dict, - Iterable, - Iterator, - List, - Literal, - Mapping, - Optional, - Tuple, - Union, -) - -import datasets -import numpy as np -from tqdm import tqdm - -from lm_eval import utils -from lm_eval.api import samplers -from lm_eval.api.instance import Instance, OutputType -from lm_eval.api.metrics import bits_per_byte, mean, weighted_perplexity -from lm_eval.api.registry import ( - AGGREGATION_REGISTRY, - DEFAULT_METRIC_REGISTRY, - get_aggregation, - get_metric, - get_metric_aggregation, - is_higher_better, -) -from lm_eval.caching.cache import load_from_cache, save_to_cache -from lm_eval.filters import build_filter_ensemble -from lm_eval.prompts import get_prompt - - -ALL_OUTPUT_TYPES = [ - "loglikelihood", - "multiple_choice", - "loglikelihood_rolling", - "generate_until", -] - -eval_logger = logging.getLogger(__name__) - - -@dataclass -class TaskConfig(dict): - # task naming/registry - task: Optional[str] = None - task_alias: Optional[str] = None - tag: Optional[Union[str, list]] = None 
- # HF dataset options. - # which dataset to use, - # and what splits for what purpose - custom_dataset: Optional[Callable] = None - dataset_path: Optional[str] = None - dataset_name: Optional[str] = None - dataset_kwargs: Optional[dict] = None - training_split: Optional[str] = None - validation_split: Optional[str] = None - test_split: Optional[str] = None - fewshot_split: Optional[str] = ( - None # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaluating (?) - ) - # formatting / prompting options. - # see docs/advanced_task_guide.md for more info - process_docs: Optional[Callable] = None - doc_to_text: Optional[Union[Callable, str]] = None - doc_to_target: Optional[Union[Callable, str]] = None - doc_to_image: Union[Callable, str] = None - doc_to_audio: Union[Callable, str] = None - unsafe_code: bool = False - doc_to_choice: Optional[Union[Callable, str, dict, list]] = None - process_results: Optional[Union[Callable, str]] = None - use_prompt: Optional[str] = None - description: str = "" - target_delimiter: str = " " - fewshot_delimiter: str = "\n\n" - fewshot_config: Optional[dict] = None - # runtime configuration options - num_fewshot: Optional[int] = None - # scoring options - metric_list: Optional[list] = None - output_type: OutputType = "generate_until" - generation_kwargs: Optional[dict] = None - repeats: int = 1 - filter_list: Optional[Union[str, list]] = None - should_decontaminate: bool = False - doc_to_decontamination_query: Optional[str] = None - gen_prefix: Optional[str] = None - metadata: Optional[dict] = ( - None # by default, not used in the code. allows for users to pass arbitrary info to tasks - ) - - def __post_init__(self) -> None: - if self.generation_kwargs is not None: - if self.output_type != "generate_until": - eval_logger.warning( - f"[{self.task}] passed `generation_kwargs`, but not using `output_type: generate_until`!" 
- ) - - if "temperature" in self.generation_kwargs: - self.generation_kwargs["temperature"] = float( - self.generation_kwargs["temperature"] - ) - - if "until" not in self.generation_kwargs: - eval_logger.warning( - f"{self.task}: No `until` specified in `generation_kwargs`! Defaulting to the fewshot_delimiter={repr(self.fewshot_delimiter)}" - ) - self.generation_kwargs["until"] = [self.fewshot_delimiter] - else: - if self.output_type == "generate_until": - # ensure that we greedily generate in absence of explicit arguments otherwise - self.generation_kwargs = { - "until": ( - None - if self.fewshot_delimiter is None - else [self.fewshot_delimiter] - ), - "do_sample": False, - "temperature": 0, - } - eval_logger.warning( - f"{self.task}: No `generation_kwargs` specified in task config, defaulting to {self.generation_kwargs}" - ) - - def __getitem__(self, item): - return getattr(self, item) - - def __setitem__(self, item, value): - return setattr(self, item, value) - - def to_dict(self, keep_callable: bool = False) -> dict: - """dumps the current config as a dictionary object, as a printable format. - null fields will not be printed. - Used for dumping results alongside full task configuration - - :return: dict - A printable dictionary version of the TaskConfig object. - - # TODO: should any default value in the TaskConfig not be printed? 
- """ - cfg_dict = asdict(self) - # remove values that are `None` - for k, v in list(cfg_dict.items()): - if v is None: - cfg_dict.pop(k) - elif k == "metric_list": - for metric_dict in v: - for metric_key, metric_value in metric_dict.items(): - if callable(metric_value): - metric_dict[metric_key] = self.serialize_function( - metric_value, keep_callable=keep_callable - ) - cfg_dict[k] = v - elif callable(v): - cfg_dict[k] = self.serialize_function(v, keep_callable=keep_callable) - return cfg_dict - - def serialize_function( - self, value: Union[Callable, str], keep_callable=False - ) -> Union[Callable, str]: - """Serializes a given function or string. - - If 'keep_callable' is True, the original callable is returned. - Otherwise, attempts to return the source code of the callable using 'getsource'. - """ - if keep_callable: - return value - else: - try: - return getsource(value) - except (TypeError, OSError): - return str(value) - - -class Task(abc.ABC): - """A task represents an entire benchmark including its dataset, problems, - answers, and evaluation methods. See BoolQ for a simple example implementation - - A `doc` can be any python object which represents one instance of evaluation. - This is usually a dictionary e.g. - {"question": ..., "answer": ...} or - {"question": ..., question, answer) - """ - - VERSION: Optional[Union[int, str]] = None - - # The name of the `Task` benchmark as denoted in the HuggingFace datasets Hub - # or a path to a custom `datasets` loading script. - DATASET_PATH: Optional[str] = None - - # The name of a subset within `DATASET_PATH`. 
- DATASET_NAME: Optional[str] = None - - OUTPUT_TYPE: Optional[OutputType] = None - - def __init__( - self, - data_dir: Optional[str] = None, - cache_dir: Optional[str] = None, - download_mode: Optional[datasets.DownloadMode] = None, - config: Optional[Mapping] = None, # Union[dict, TaskConfig] - ) -> None: - """ - :param data_dir: str - Stores the path to a local folder containing the `Task`'s data files. - Use this to specify the path to manually downloaded data (usually when - the dataset is not publicly accessible). - :param cache_dir: str - The directory to read/write the `Task` dataset. This follows the - HuggingFace `datasets` API with the default cache directory located at: - `~/.cache/huggingface/datasets` - NOTE: You can change the cache location globally for a given process - to another directory: - `export HF_DATASETS_CACHE="/path/to/another/directory"` - :param download_mode: datasets.DownloadMode - How to treat pre-existing `Task` downloads and data. - - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS` - Reuse download and reuse dataset. - - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS` - Reuse download with fresh dataset. - - `datasets.DownloadMode.FORCE_REDOWNLOAD` - Fresh download and fresh dataset. - """ - self.download(data_dir, cache_dir, download_mode) - self._training_docs: Optional[list] = None - self._fewshot_docs: Optional[list] = None - self._instances: Optional[List[Instance]] = None - - self._config: TaskConfig = TaskConfig({**config}) if config else TaskConfig() - - self._filters = [build_filter_ensemble("none", [["take_first", None]])] - self.fewshot_rnd: Optional[random.Random] = ( - None # purposely induce errors in case of improper usage - ) - - def download( - self, - data_dir: Optional[str] = None, - cache_dir: Optional[str] = None, - download_mode=None, - ) -> None: - """Downloads and returns the task dataset. - Override this method to download the dataset from a custom API. 
- - :param data_dir: str - Stores the path to a local folder containing the `Task`'s data files. - Use this to specify the path to manually downloaded data (usually when - the dataset is not publicly accessible). - :param cache_dir: str - The directory to read/write the `Task` dataset. This follows the - HuggingFace `datasets` API with the default cache directory located at: - `~/.cache/huggingface/datasets` - NOTE: You can change the cache location globally for a given process - by setting the shell environment variable, `HF_DATASETS_CACHE`, - to another directory: - `export HF_DATASETS_CACHE="/path/to/another/directory"` - :param download_mode: datasets.DownloadMode - How to treat pre-existing `Task` downloads and data. - - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS` - Reuse download and reuse dataset. - - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS` - Reuse download with fresh dataset. - - `datasets.DownloadMode.FORCE_REDOWNLOAD` - Fresh download and fresh dataset. - """ - self.dataset = datasets.load_dataset( - path=self.DATASET_PATH, - name=self.DATASET_NAME, - data_dir=data_dir, - cache_dir=cache_dir, - download_mode=download_mode, - ) - - @property - def config(self) -> TaskConfig: - """Returns the TaskConfig associated with this class.""" - return self._config - - @abc.abstractmethod - def has_training_docs(self): - """Whether the task has a training set""" - pass - - @abc.abstractmethod - def has_validation_docs(self): - """Whether the task has a validation set""" - pass - - @abc.abstractmethod - def has_test_docs(self): - """Whether the task has a test set""" - pass - - def training_docs(self) -> Iterable: - """ - :return: Iterable[obj] - A iterable of any object, that doc_to_text can handle - """ - return [] - - def validation_docs(self) -> Iterable: - """ - :return: Iterable[obj] - A iterable of any object, that doc_to_text can handle - """ - return [] - - def test_docs(self) -> Iterable: - """ - :return: Iterable[obj] - A iterable of any object, 
that doc_to_text can handle - """ - return [] - - def fewshot_docs(self) -> Iterable: - """ - :return: Iterable[obj] - A iterable of any object, that doc_to_text can handle - """ - if self.has_training_docs(): - return self.training_docs() - elif self.has_validation_docs(): - return self.validation_docs() - else: - if self.config.get("num_fewshot", 0) > 0: - eval_logger.warning( - f"[Task: {self.config.task}] has_training_docs and has_validation_docs are False" - ", using test_docs as fewshot_docs but this is not recommended." - ) - return self.test_docs() - - def _process_doc(self, doc: dict) -> dict: - """ - Override this to process (detokenize, strip, replace, etc.) individual - documents. This can be used in a map over documents of a data split. - E.g. `map(self._process_doc, self.dataset["validation"])` - - :return: dict - The processed version of the specified `doc`. - """ - return doc - - @property - def instances(self) -> List[Instance]: - """After calling `task.build_all_requests()`, tasks - maintain a list of the dataset instances which will be evaluated. - """ - return self._instances - - def fewshot_examples(self, k, rnd): - if self._training_docs is None: - self._training_docs = list(self.training_docs()) - - return rnd.sample(self._training_docs, k) - - def doc_to_decontamination_query(self, doc): - raise NotImplementedError( - "Override doc_to_decontamination_query with document specific decontamination query." 
- ) - - @abc.abstractmethod - def doc_to_text(self, doc): - pass - - @abc.abstractmethod - def doc_to_target(self, doc): - pass - - # not an abstractmethod because not every language-only task has to implement this - def doc_to_image(self, doc): - raise NotImplementedError - - def doc_to_audio(self, doc): - raise NotImplementedError - - def doc_to_prefix(self, doc): - return "" - - def build_all_requests( - self, - *, - limit: Union[int, None] = None, - samples: Optional[List[int]] = None, - rank: int = 0, - world_size: int = 1, - cache_requests: bool = False, - rewrite_requests_cache: bool = False, - system_instruction: Optional[str] = None, - apply_chat_template: bool = False, - fewshot_as_multiturn: bool = False, - chat_template: Optional[Callable] = None, - tokenizer_name: str = "", - ) -> None: - """Build a set of Instances for a task, and store them in task.instances""" - - # used with caching - og_limit = limit - - cache_key = f"requests-{self._config.task}-{self.config.num_fewshot}shot-rank{rank}-world_size{world_size}" - cache_key += "-chat_template" if apply_chat_template else "" - cache_key += "-fewshot_as_multiturn" if fewshot_as_multiturn else "" - cache_key += ( - f"-system_prompt_hash{utils.hash_string(system_instruction)}" - if system_instruction is not None - else "" - ) - cache_key += f"-tokenizer{tokenizer_name}" - - cached_instances = load_from_cache(file_name=cache_key, cache=cache_requests) - - if cache_requests and cached_instances and not rewrite_requests_cache: - cached_instances = cached_instances[:limit] - - flattened_instances = [ - instance - for instance_group in cached_instances - for instance in instance_group - ] - - self._instances = flattened_instances - return - - eval_logger.info(f"Building contexts for {self.config.task} on rank {rank}...") - - instances = [] - - # process all documents when caching is specified for simplicity - if ( - cache_requests - and (not cached_instances or rewrite_requests_cache) - and limit is not None 
- ): - limit = None - - doc_id_docs = list( - self.doc_iterator( - rank=rank, limit=limit, samples=samples, world_size=world_size - ) - ) - - num_docs = len(doc_id_docs) - - for doc_id, doc in tqdm( - doc_id_docs, - total=num_docs, - ): - # sample fewshot context #TODO: need to offset doc_id by rank now! - fewshot_ctx = self.fewshot_context( - doc, - 0 if self.config.num_fewshot is None else self.config.num_fewshot, - system_instruction, - apply_chat_template, - fewshot_as_multiturn, - chat_template, - gen_prefix=self.doc_to_prefix(doc), - ) - - # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute - inst = self.construct_requests( - doc=doc, - ctx=fewshot_ctx, - metadata=(self.config["task"], doc_id, self.config.repeats), - apply_chat_template=apply_chat_template, - chat_template=chat_template, - ) - - if not isinstance(inst, list): - inst = [inst] - - instances.append(inst) - - # now flatten, this is to allow slicing to work with pickles - - sliced_instances = instances[:og_limit] - - flattened_instances = [ - instance - for instance_group in sliced_instances - for instance in instance_group - ] - - self._instances = flattened_instances - - if len(self._instances) == 0: - raise ValueError("task.build_requests() did not find any docs!") - - if cache_requests and (not cached_instances or rewrite_requests_cache): - save_to_cache(file_name=cache_key, obj=instances) - - @abc.abstractmethod - def construct_requests(self, doc, ctx, **kwargs): - """Uses RequestFactory to construct Requests and returns an iterable of - Requests which will be sent to the LM. - - :param doc: - The document as returned from training_docs, validation_docs, or test_docs. - :param ctx: str - The context string, generated by fewshot_context. This includes the natural - language description, as well as the few shot examples, and the question - part of the document for `doc`. 
- :param doc_idx: int - The index of a document within `self.test_docs()` or `self.validation_docs()`, - whichever is the main split used. - :param repeats: int - TODO: update this docstring - The number of times each instance in a dataset is inferred on. Defaults to 1, - can be increased for techniques like majority voting. - """ - pass - - @abc.abstractmethod - def process_results(self, doc, results): - """Take a single document and the LM results and evaluates, returning a - dict where keys are the names of submetrics and values are the values of - the metric for that one document - - :param doc: - The document as returned from training_docs, validation_docs, or test_docs. - :param results: - The results of the requests created in construct_requests. - """ - pass - - @abc.abstractmethod - def aggregation(self): - """ - :returns: {str: [metric_score] -> float} - A dictionary where keys are the names of submetrics and values are - functions that aggregate a list of metric scores - """ - pass - - @abc.abstractmethod - def higher_is_better(self): - """ - :returns: {str: bool} - A dictionary where keys are the names of submetrics and values are - whether a higher value of the submetric is better - """ - pass - - def get_config(self, key: str) -> Any: - return getattr(self._config, key, None) - - @classmethod - def count_bytes(cls, doc): - """Used for byte-level perplexity metrics in rolling loglikelihood""" - return len(doc.encode("utf-8")) - - @classmethod - def count_words(cls, doc): - """Downstream loglikelihood_rolling perplexity tasks with custom word boundaries should override this!""" - return len(re.split(r"\s+", doc)) - - @utils.positional_deprecated - def fewshot_context(self, doc, num_fewshot, rnd=None, description=None, **kwargs): - """Returns a fewshot context string that is made up of a prepended description - (if provided), the `num_fewshot` number of examples, and an appended prompt example. 
- - :param doc: str - The document as returned from training_docs, validation_docs, or test_docs. - :param num_fewshot: int - The number of fewshot examples to provide in the returned context string. - :param rnd: random.Random - The pseudo-random number generator used to randomly sample examples. - WARNING: This is currently a required arg although it's optionalized with a default `None`. - :param description: str - The task's description that will be prepended to the fewshot examples. - :returns: str - The fewshot context. - """ - if rnd is None: - if self.fewshot_rnd is not None: - rnd = self.fewshot_rnd - else: - raise ValueError( - "A `random.Random` generator argument must be provided to `rnd`" - ) - - description = description if description else "" - - if num_fewshot == 0: - labeled_examples = "" - else: - # for sets with no training docs, draw from other set *but ensure no overlap with current doc* - if self.has_training_docs(): - fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd) - else: - if self._fewshot_docs is None: - self._fewshot_docs = list( - self.validation_docs() - if self.has_validation_docs() - else self.test_docs() - ) - - fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1) - - # get rid of the doc that's the one we're evaluating, if it's in the fewshot - fewshotex = [x for x in fewshotex if x != doc][:num_fewshot] - - labeled_examples = ( - "\n\n".join( - [ - self.doc_to_text(doc) + self.doc_to_target(doc) - for doc in fewshotex - ] - ) - + "\n\n" - ) - - example = self.doc_to_text(doc) - return description + labeled_examples + example - - def apply_filters(self) -> Optional[List[Instance]]: - """Iterates over FilterEnsembles and applies them to instances""" - if hasattr(self, "_filters"): - for f in self._filters: - f.apply(self._instances) - else: - eval_logger.warning("No filter defined, passing through instances") - return self._instances - - def dump_config(self) -> dict: - """Returns the config as a dictionary.""" - # 
TODO: this should only return the overrides applied to a non-YAML task's configuration. - # (num_fewshot) - return self.config.to_dict() - - def set_config(self, key: str, value: Any, update: bool = False) -> None: - """Set or update the configuration for a given key.""" - if key is None: - raise ValueError("Key must be provided.") - - if update: - current_value = getattr(self._config, key, {}) - if not isinstance(current_value, dict): - raise TypeError( - f"Expected a dict for key '{key}', got {type(current_value).__name__} instead." - ) - current_value.update(value) - else: - setattr(self._config, key, value) - - def override_metric(self, metric_name: str) -> None: - """ - Override the default metrics used for evaluation with custom metrics. - - Parameters: - - metric_name (str): The name of the custom metric to override. Should be registered in api.metrics. - """ - ( - self._metric_fn_list, - self._aggregation_list, - self._metric_fn_kwargs, - self._higher_is_better, - ) = ({}, {}, {}, {}) - self._metric_fn_list[metric_name] = get_metric(metric_name) - self._aggregation_list[metric_name] = get_metric_aggregation(metric_name) - self._higher_is_better[metric_name] = is_higher_better(metric_name) - self._metric_fn_kwargs[metric_name] = {} - if not isinstance(self, ConfigurableTask): - self.process_results = lambda x, y: {metric_name: get_metric(metric_name)} - self.aggregation = lambda: { - metric_name: get_metric_aggregation(metric_name) - } - setattr(self._config, "metric_list", [{"metric": metric_name}]) - setattr(self._config, "process_results", None) - - def set_fewshot_seed(self, seed: Optional[int] = None) -> None: - self.fewshot_rnd = random.Random(seed) - if hasattr(self, "sampler"): - self.sampler.rnd = self.fewshot_rnd - - @property - def eval_docs(self) -> Union[datasets.Dataset, List[dict]]: - if self.has_test_docs(): - return self.test_docs() - elif self.has_validation_docs(): - return self.validation_docs() - else: - raise ValueError( - f"Task 
dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!" - ) - - def doc_iterator( - self, - *, - rank: int = 0, - limit: Union[int, None] = None, - world_size: int = 1, - samples: Optional[List[int]] = None, - ) -> Iterator[Tuple[int, Any]]: - if samples: - n = len(self.eval_docs) - assert all([e < n for e in samples]), ( - f"Elements of --samples should be in the interval [0,k-1] where k is the number of total examples. In this case, k={n}." - ) - eval_logger.info( - f"{self.config.task}: Evaluating on {len(samples)} examples" - ) - doc_iterator = utils.create_iterator( - enumerate(x for i, x in enumerate(self.eval_docs) if i in samples), - rank=int(rank), - limit=None, # limit does not matter here since we are selecting samples directly - world_size=int(world_size), - ) - else: - limit = int(limit) if limit else None - doc_iterator = utils.create_iterator( - enumerate(self.eval_docs), - rank=int(rank), - limit=limit, - world_size=int(world_size), - ) - return doc_iterator - - -class ConfigurableTask(Task): - VERSION = "Yaml" - OUTPUT_TYPE = None - CONFIG = None - - def __init__( - self, - data_dir=None, - cache_dir=None, - download_mode=None, - config: Optional[dict] = None, - ) -> None: # TODO no super() call here - # Get pre-configured attributes - self._config = self.CONFIG - - # Use new configurations if there was no preconfiguration - if self.config is None: - self._config = TaskConfig(**config) - # Overwrite configs - else: - if config is not None: - self._config.__dict__.update(config) - - if self.config is None: - raise ValueError( - "Must pass a config to ConfigurableTask, either in cls.CONFIG or `config` kwarg" - ) - - if isinstance(self.config.metadata, dict): - if "version" in self.config.metadata: - self.VERSION = self.config.metadata["version"] - - if self.config.output_type is not None: - if self.config.output_type not in ALL_OUTPUT_TYPES: - raise ValueError( - f"Got invalid output_type 
'{self.config.output_type}', must be in '{','.join(ALL_OUTPUT_TYPES)}'" - ) - self.OUTPUT_TYPE = self.config.output_type - - if self.config.doc_to_image is not None: - # mark the task as requiring multimodality. - self.MULTIMODAL = True - - if self.config.doc_to_audio: - # mark the task as requiring multimodality. - self.MULTIMODAL = True - - if self.config.unsafe_code is not False: - self.UNSAFE_CODE = True - - if self.config.dataset_path is not None: - self.DATASET_PATH = self.config.dataset_path - - if self.config.dataset_name is not None: - self.DATASET_NAME = self.config.dataset_name - - self._metric_fn_list = {} - self._metric_fn_kwargs = {} - self._aggregation_list = {} - self._higher_is_better = {} - - if self.config.metric_list is None: - # TODO: handle this in TaskConfig.__post_init__ ? - _metric_list = DEFAULT_METRIC_REGISTRY[self.config.output_type] - - for metric_name in _metric_list: - self._metric_fn_list[metric_name] = get_metric(metric_name) - self._metric_fn_kwargs[metric_name] = {} - self._aggregation_list[metric_name] = get_metric_aggregation( - metric_name - ) - self._higher_is_better[metric_name] = is_higher_better(metric_name) - else: - for metric_config in self.config.metric_list: - if "metric" not in metric_config: - raise ValueError( - "'metric' key not provided for an entry in 'metric_list', must be specified!" 
- ) - metric_name = metric_config["metric"] - kwargs = { - key: metric_config[key] - for key in metric_config - if key - not in ["metric", "aggregation", "higher_is_better", "hf_evaluate"] - } - hf_evaluate_metric = ( - "hf_evaluate" in metric_config - and metric_config["hf_evaluate"] is True - ) - - if self.config.process_results is not None: - self._metric_fn_list[metric_name] = None - self._metric_fn_kwargs[metric_name] = {} - elif callable(metric_name): - metric_fn = metric_name.__call__ - metric_name = metric_name.__name__ - self._metric_fn_list[metric_name] = metric_fn - self._metric_fn_kwargs[metric_name] = kwargs - else: - self._metric_fn_list[metric_name] = get_metric( - metric_name, hf_evaluate_metric - ) - self._metric_fn_kwargs[metric_name] = kwargs - - if "aggregation" in metric_config: - agg_name = metric_config["aggregation"] - if isinstance(agg_name, str): - self._aggregation_list[metric_name] = get_aggregation(agg_name) - elif callable(agg_name): # noqa: E721 - self._aggregation_list[metric_name] = metric_config[ - "aggregation" - ] - else: - INV_AGG_REGISTRY = {v: k for k, v in AGGREGATION_REGISTRY.items()} - metric_agg = get_metric_aggregation(metric_name) - eval_logger.warning( - f"[Task: {self.config.task}] metric {metric_name} is defined, but aggregation is not. " - f"using default " - f"aggregation={INV_AGG_REGISTRY[metric_agg]}" - ) - self._aggregation_list[metric_name] = metric_agg - - if "higher_is_better" in metric_config: - self._higher_is_better[metric_name] = metric_config[ - "higher_is_better" - ] - else: - eval_logger.warning( - f"[Task: {self.config.task}] metric {metric_name} is defined, but higher_is_better is not. 
" - f"using default " - f"higher_is_better={is_higher_better(metric_name)}" - ) - self._higher_is_better[metric_name] = is_higher_better(metric_name) - - self.download(self.config.dataset_kwargs) - self._training_docs = None - self._fewshot_docs = None - - if self.config.filter_list is not None: - self._filters = [] - for filter_config in self.config.filter_list: - filter_name = filter_config["name"] - filter_functions = filter_config["filter"] - components = [] - for function in filter_functions: - kwargs = { - key: function[key] for key in function if key != "function" - } - components.append([function["function"], kwargs]) - filter_pipeline = build_filter_ensemble(filter_name, components) - self._filters.append(filter_pipeline) - else: - # TODO: handle repeats in a more general way rather than just discarding - eval_logger.debug( - "No custom filters defined. Using default 'take_first' filter for handling repeats." - ) - self._filters = [build_filter_ensemble("none", [["take_first", None]])] - - if self.config.use_prompt is not None: - eval_logger.info(f"loading prompt {self.config.use_prompt}") - self.prompt = get_prompt( - self.config.use_prompt, self.DATASET_PATH, self.DATASET_NAME - ) - else: - self.prompt = None - - if self.fewshot_docs() is not None: - self.fewshot_rnd = ( - random.Random() - ) # setting with no seed, to be overridden at a later time - config_sampler: Union[str, Callable] = ( - self.config.fewshot_config.get("sampler", "default") - if self.config.fewshot_config - else "default" - ) - if isinstance(config_sampler, str): - self.sampler = samplers.get_sampler(config_sampler)( - list(self.fewshot_docs()), self, rnd=self.fewshot_rnd - ) - elif callable(config_sampler) and issubclass( - config_sampler, samplers.ContextSampler - ): - self.sampler = config_sampler( - docs=list(self.fewshot_docs()), task=self, rnd=self.fewshot_rnd - ) - else: - raise TypeError( - f"fewshot_config.sampler should be a string or callable of ContextSampler type, " - 
f"not {type(config_sampler)}" - ) - - self.task_docs = self.eval_docs - - # Test One Doc - self.features = list(self.task_docs.features.keys()) - self.multiple_input = 0 - self.multiple_target = 0 - test_doc = self.task_docs[0] - test_text = self.doc_to_text(test_doc) - test_target = self.doc_to_target(test_doc) - - if self.config.doc_to_choice is not None: - test_choice = self.doc_to_choice(test_doc) - if not isinstance(test_choice, list): - eval_logger.error("doc_to_choice must return list") - else: - num_choice = len(test_choice) - - if isinstance(test_text, int): - eval_logger.debug( - "doc_to_text returned an int. Assuming multiple inputs." - ) - self.multiple_input = num_choice - else: - test_choice = None - - if isinstance(test_target, list): - eval_logger.debug( - "doc_to_target returned a list. Assuming multiple targets." - ) - self.multiple_target = len(test_target) - else: - if (isinstance(test_target, int)) and (test_choice is not None): - test_target = test_choice[test_target] - else: - test_target = str(test_target) - - if test_choice is not None: - check_choices = test_choice - else: - check_choices = [test_target] - if self.config.doc_to_choice is not None: - for choice in check_choices: - choice_has_whitespace = True if choice[0].isspace() else False - delimiter_has_whitespace = ( - True - if self.config.target_delimiter.rstrip() - != self.config.target_delimiter - else False - ) - - if delimiter_has_whitespace and choice_has_whitespace: - eval_logger.debug( - f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" have whitespace' - ) - elif (not delimiter_has_whitespace) and (not choice_has_whitespace): - eval_logger.debug( - f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" do not have whitespace, ignore if the language you are evaluating on does not require/use whitespace' - ) - - def download( - self, dataset_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ) -> None: - if 
isinstance(self.config.custom_dataset, Callable): - eval_logger.warning( - f"{self.config.task}: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager." - + "\nFor example --metadata='{\"max_seq_lengths\":[4096, 8192]}'. For details see task Readme." - ) - self.dataset = self.config.custom_dataset( - **(self.config.metadata or {}), **(self.config.dataset_kwargs or {}) - ) - else: - self.dataset = datasets.load_dataset( - path=self.DATASET_PATH, - name=self.DATASET_NAME, - **dataset_kwargs if dataset_kwargs is not None else {}, - ) - - def has_training_docs(self) -> bool: - if self.config.training_split is not None: - return True - else: - return False - - def has_validation_docs(self) -> bool: - if self.config.validation_split is not None: - return True - else: - return False - - def has_test_docs(self) -> bool: - if self.config.test_split is not None: - return True - else: - return False - - def training_docs(self) -> datasets.Dataset: - if self.has_training_docs(): - if self.config.process_docs is not None: - return self.config.process_docs( - self.dataset[self.config.training_split] - ) - return self.dataset[self.config.training_split] - - def validation_docs(self) -> datasets.Dataset: - if self.has_validation_docs(): - if self.config.process_docs is not None: - return self.config.process_docs( - self.dataset[self.config.validation_split] - ) - return self.dataset[self.config.validation_split] - - def test_docs(self) -> datasets.Dataset: - if self.has_test_docs(): - if self.config.process_docs is not None: - return self.config.process_docs(self.dataset[self.config.test_split]) - return self.dataset[self.config.test_split] - - def fewshot_docs(self): - if self.config.fewshot_split is not None: - if self.config.process_docs is not None: - return self.config.process_docs(self.dataset[self.config.fewshot_split]) - return self.dataset[self.config.fewshot_split] - elif ( - self.config.fewshot_config is not None - and 
self.config.fewshot_config.get("samples", None) is not None - ): - if isinstance(self.config.fewshot_config["samples"], list): - return self.config.fewshot_config["samples"] - elif callable(self.config.fewshot_config["samples"]): - return self.config.fewshot_config["samples"]() - else: - raise Exception( - "`fewshot_config['samples']` was incorrectly defined in the configuration. It should be either a list of samples as a dict, or function returning this list." - ) - else: - if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0): - eval_logger.warning( - f"[Task: {self.config.task}] " - "num_fewshot > 0 but fewshot_split is None. " - "using preconfigured rule." - ) - return super().fewshot_docs() - - @staticmethod - def append_target_question( - labeled_examples: List[Dict[str, str]], - question: str, - fewshot_as_multiturn: bool = False, - gen_prefix: Optional[str] = None, - ) -> None: - """Adds a target question to the labeled examples list. - If fewshot_as_multiturn is True, or labeled_examples is empty, or the last entry is a system turn, appends the question as a new user entry. - Otherwise, it is appended to the last user entry, ensuring that the conversation alternates between the user and the assistant. 
- """ - if not fewshot_as_multiturn: - # if no messages or last message is system, append as new user entry - if len(labeled_examples) == 0 or labeled_examples[-1]["role"] == "system": - labeled_examples.append({"role": "user", "content": question}) - # if last message is user, append to it to avoid two user messages in a row - else: - labeled_examples[-1]["content"] += question - else: - # if fewshot_as_multiturn is True, append as next user entry (last is always assistant) - labeled_examples.append({"role": "user", "content": question}) - if gen_prefix: - labeled_examples.append({"role": "assistant", "content": gen_prefix}) - - @utils.positional_deprecated - def fewshot_context( - self, - doc: dict, - num_fewshot: int, - system_instruction: Optional[str] = None, - apply_chat_template: bool = False, - fewshot_as_multiturn: bool = False, - chat_template: Optional[Callable] = None, - gen_prefix: Optional[str] = None, - ) -> Union[str, List[str]]: - """Returns a fewshot context string that is made up of a prepended description - (if provided), the `num_fewshot` number of examples, and an appended prompt example. - - :param doc: str - The document as returned from training_docs, validation_docs, or test_docs. - :param num_fewshot: int - The number of fewshot examples to provide in the returned context string. - :param system_instruction: str - System instruction to be applied to the prompt. - :param apply_chat_template: bool - Whether to apply the chat template to the fewshot context. - :param fewshot_as_multiturn: bool - Whether to provide the fewshot examples as a multiturn conversation or a single user turn. - :param chat_template: - callable (from lm.apply_chat_template) that takes in a list[Dict] chat transcript and renders it into a string. - :param gen_prefix: - String to append after the <|assistant|> token. - :returns: str - The fewshot context. 
- """ - if apply_chat_template: - labeled_examples = [] - else: - labeled_examples = "" - - # get task description - if description := self.config.description: - description = utils.apply_template(self.config.description, doc) - - # create system prompt based on the provided system instruction and description - if system_instruction is not None and description: - system_prompt = ( - f"{system_instruction}{self.sampler.fewshot_delimiter}{description}" - ) - elif system_instruction is not None: - system_prompt = system_instruction - elif description: - system_prompt = description - else: - system_prompt = "" - - # add system prompt if specified - if system_prompt: - if apply_chat_template: - labeled_examples.append({"role": "system", "content": system_prompt}) - else: - labeled_examples = system_prompt - # if few-shot - append examples after the system prompt - if num_fewshot > 0: - if apply_chat_template: - labeled_examples.extend( - self.sampler.get_chat_context( - doc, - num_fewshot, - fewshot_as_multiturn, - gen_prefix=gen_prefix, - ) - ) - else: - labeled_examples += self.sampler.get_context( - doc, num_fewshot, gen_prefix=gen_prefix - ) - - example = self.doc_to_text(doc) - if apply_chat_template: - if self.multiple_input: - # TODO: append prefill? - if not labeled_examples: - return "" - return chat_template(labeled_examples) - if isinstance(example, str): - self.append_target_question( - labeled_examples, - example, - fewshot_as_multiturn, - gen_prefix=gen_prefix, - ) - # for loglikelihood create a list of questions with appended choices - elif isinstance(example, list): - labeled_examples_list = [] - # copy chat history for each example and append the answer - for ex in example: - chat = deepcopy(labeled_examples) - self.append_target_question( - chat, - ex, - fewshot_as_multiturn, - gen_prefix=gen_prefix, - ) - # TODO: append prefill? 
- labeled_examples_list.append( - chat_template( - chat, - add_generation_prompt=False if gen_prefix else True, - ) - ) - return labeled_examples_list - # if example is an integer, append the choice or convert to string - elif isinstance(example, int): - if self.config.doc_to_choice is not None: - choices = self.doc_to_choice(doc) - self.append_target_question( - labeled_examples, - choices[example], - fewshot_as_multiturn, - gen_prefix=gen_prefix, - ) - else: - self.append_target_question( - labeled_examples, - str(example), - fewshot_as_multiturn, - gen_prefix=gen_prefix, - ) - # return lm.apply_chat_template(labeled_examples) - return chat_template( - labeled_examples, - add_generation_prompt=False if gen_prefix else True, - ) - else: - prefix = ( - self.config.target_delimiter + gen_prefix - if gen_prefix is not None - else "" - ) - if self.multiple_input: - return labeled_examples - if isinstance(example, str): - return labeled_examples + example + prefix - elif isinstance(example, list): - return [labeled_examples + ex + prefix for ex in example] - elif isinstance(example, int): - if self.config.doc_to_choice is not None: - choices = self.doc_to_choice(doc) - return labeled_examples + choices[example] + prefix - else: - return labeled_examples + str(example) + prefix - - def apply_filters(self) -> Optional[List[Instance]]: - """Iterates over FilterEnsembles and applies them to instances""" - if hasattr(self, "_filters"): - for f in self._filters: - f.apply(self._instances) - else: - eval_logger.warning("No filter defined, passing through instances") - return self._instances - - def should_decontaminate(self): - return self.config.should_decontaminate - - def doc_to_decontamination_query(self, doc: dict): - if self.config.should_decontaminate: - if self.config.doc_to_decontamination_query is None: - return self.doc_to_text(doc) - else: - doc_to_decontamination_query = self.config.doc_to_decontamination_query - if doc_to_decontamination_query in self.features: 
- return doc[doc_to_decontamination_query] - elif callable(doc_to_decontamination_query): - return doc_to_decontamination_query(doc) - else: - return ast.literal_eval( - utils.apply_template( - self.config.doc_to_decontamination_query, doc - ) - ) - - def _process_doc(self, doc: dict) -> dict: - """ - Override this to process (detokenize, strip, replace, etc.) individual - documents. This can be used in a map over documents of a data split. - E.g. `map(self._process_doc, self.dataset["validation"])` - - :return: dict - The processed version of the specified `doc`. - """ - return doc - - def doc_to_text(self, doc, doc_to_text=None): - if self.prompt is not None: - doc_to_text = self.prompt - elif doc_to_text is not None: - doc_to_text = doc_to_text - else: - doc_to_text = self.config.doc_to_text - - if isinstance(doc_to_text, int): - return doc_to_text - elif isinstance(doc_to_text, str): - if doc_to_text in self.features: - # if self.config.doc_to_choice is not None: - # return self.doc_to_choice(doc)[doc[doc_to_text]] - # else: - return doc[doc_to_text] - else: - text_string = utils.apply_template(doc_to_text, doc) - if text_string.isdigit() and self._config.doc_to_choice is not None: - return ast.literal_eval(text_string) - else: - return text_string - elif callable(doc_to_text): - return doc_to_text(doc) - # Used when applying a Promptsource template - elif hasattr(doc_to_text, "apply"): - applied_prompt = doc_to_text.apply(doc) - if len(applied_prompt) == 2: - return applied_prompt[0] - else: - eval_logger.warning("Applied prompt returns empty string") - return self.config.fewshot_delimiter - else: - print(type(doc_to_text)) - raise TypeError - - def doc_to_target(self, doc: Mapping, doc_to_target=None) -> Union[int, str, list]: - if self.prompt is not None: - doc_to_target = self.prompt - elif doc_to_target is not None: - doc_to_target = doc_to_target - else: - doc_to_target = self.config.doc_to_target - - if isinstance(doc_to_target, int): - return 
doc_to_target - elif isinstance(doc_to_target, str): - if doc_to_target in self.features: - # if self.config.doc_to_choice is not None: - # return self.doc_to_choice(doc)[doc[doc_to_target]] - # else: - return doc[doc_to_target] - else: - target_string = utils.apply_template(doc_to_target, doc) - if target_string.isdigit() and self._config.doc_to_choice is not None: - return ast.literal_eval(target_string) - elif ( - len(target_string) >= 2 - and (target_string[0] == "[") - and (target_string[-1] == "]") - ): - try: - return ast.literal_eval(target_string) - except (SyntaxError, ValueError): - return target_string - else: - return target_string - elif isinstance(doc_to_target, list): - return doc_to_target - elif callable(doc_to_target): - return doc_to_target(doc) - # Used when applying a Promptsource template - elif hasattr(doc_to_target, "apply"): - applied_prompt = doc_to_target.apply(doc) - if len(applied_prompt) == 2: - return applied_prompt[1] - else: - eval_logger.warning("Applied prompt returns empty string") - return self.config.fewshot_delimiter - else: - raise TypeError - - def doc_to_choice(self, doc: Any, doc_to_choice=None) -> List[str]: - if self.prompt is not None: - doc_to_choice = self.prompt - elif doc_to_choice is not None: - doc_to_choice = doc_to_choice - elif self.config.doc_to_choice is None: - eval_logger.error("doc_to_choice was called but not set in config") - else: - doc_to_choice = self.config.doc_to_choice - - if isinstance(doc_to_choice, str): - if doc_to_choice in self.features: - return doc[doc_to_choice] - else: - return ast.literal_eval(utils.apply_template(doc_to_choice, doc)) - elif isinstance(doc_to_choice, list): - return doc_to_choice - elif isinstance(doc_to_choice, dict): - return list(doc_to_choice.values()) - elif callable(doc_to_choice): - return doc_to_choice(doc) - elif hasattr(doc_to_choice, "get_answer_choices_list"): - return doc_to_choice.get_answer_choices_list(doc) - else: - raise TypeError - - def 
doc_to_image(self, doc: Any, doc_to_image=None) -> Union[int, str, list]: - if doc_to_image is not None: - doc_to_image = doc_to_image - elif self.config.doc_to_image is not None: - doc_to_image = self.config.doc_to_image - else: - return None - - if isinstance(doc_to_image, list): - image_feature = [ - self.doc_to_image(doc, feature) for feature in doc_to_image - ] - return [feature for feature in image_feature if feature is not None] - elif isinstance(doc_to_image, str): - if doc_to_image in self.features: - return doc[doc_to_image] - else: - return ast.literal_eval(utils.apply_template(doc_to_image, doc)) - elif callable(doc_to_image): - return doc_to_image(doc) - else: - return None - - def doc_to_audio(self, doc: Any, doc_to_audio=None) -> Union[int, str, list]: - if doc_to_audio is not None: - doc_to_audio = doc_to_audio - elif self.config.doc_to_audio is not None: - doc_to_audio = self.config.doc_to_audio - else: - return None - - if isinstance(doc_to_audio, list): - audio_feature = [ - self.doc_to_audio(doc, feature) for feature in doc_to_audio - ] - return [feature for feature in audio_feature if feature is not None] - elif isinstance(doc_to_audio, str): - if doc_to_audio in self.features: - return doc[doc_to_audio] - else: - return ast.literal_eval(utils.apply_template(doc_to_audio, doc)) - elif callable(doc_to_audio): - return doc_to_audio(doc) - else: - return None - - def doc_to_prefix(self, doc): - if (gen_prefix := self.config.gen_prefix) is not None: - if gen_prefix in self.features: - return doc[gen_prefix] - else: - return utils.apply_template(gen_prefix, doc) - return None - - def construct_requests( - self, doc: dict, ctx: str, **kwargs - ) -> Union[List[Instance], Instance]: - apply_chat_template = kwargs.pop("apply_chat_template", False) - chat_template: Callable | None = kwargs.pop("chat_template", None) - - aux_arguments = None - - if self.OUTPUT_TYPE == "loglikelihood": - arguments = (ctx, self.doc_to_target(doc)) - elif self.OUTPUT_TYPE == 
"loglikelihood_rolling": - arguments = (self.doc_to_target(doc),) - elif self.OUTPUT_TYPE == "multiple_choice": - choices = self.doc_to_choice(doc) - target_delimiter = self.config.target_delimiter - if apply_chat_template: - target_delimiter = "" - if self.multiple_input: - # If there are multiple inputs, choices are placed in the ctx - # apply chat_template to choices if apply_chat_template - cont = self.doc_to_target(doc) - - arguments = [ - ( - ctx - + ( - chat_template([{"role": "user", "content": choice}]) - if apply_chat_template - else choice - ), - f"{target_delimiter}{cont}", - ) - for choice in choices - ] - else: - # Otherwise they are placed in the continuation - arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices] - - # TODO: we should raise a warning telling users this will at most ~2x runtime. - if "acc_mutual_info" in self._metric_fn_list.keys(): - # if we are calculating multiple choice accuracy - # using mutual information instead of raw loglikelihood as metric, need unconditional lls. - - # here mutual info refers to calculating - # log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice)) - # in other words normalizing by subtracting the unconditional logprob of each choice. - # TODO: should these be strided? 
will have to modify the processing in process_results if so - aux_arguments = [ - ("", f"{target_delimiter}{choice}") for choice in choices - ] - - arguments.extend(aux_arguments) - - elif self.OUTPUT_TYPE == "generate_until": - arguments = (ctx, deepcopy(self.config.generation_kwargs)) - - multimodal_arg = {} - if ( - self.config.doc_to_image - ): # TODO: ensure that non-multimodal tasks aren't getting visual args - multimodal_arg = { - **multimodal_arg, - **{"visual": self.doc_to_image(doc)}, - } - - if ( - self.config.doc_to_audio - ): # TODO: ensure that non-multimodal tasks aren't getting audio args - multimodal_arg = { - **multimodal_arg, - **{"audio": self.doc_to_audio(doc)}, - } - - if bool(multimodal_arg): - if isinstance(arguments, list): - arguments = [arg + (multimodal_arg,) for arg in arguments] - else: - arguments = arguments + (multimodal_arg,) - - if self.OUTPUT_TYPE == "multiple_choice": - request_list = [ - Instance( - request_type="loglikelihood", - doc=doc, - arguments=arg, - idx=i, - **kwargs, - ) - for i, arg in enumerate(arguments) - ] - - return request_list - - return Instance( - request_type=self.OUTPUT_TYPE, - doc=doc, - arguments=arguments, - idx=0, - **kwargs, - ) - - def process_results(self, doc, results): - if callable(self.config.process_results): - return self.config.process_results(doc, results) - - result_dict = {} - use_metric = list(self._metric_fn_list.keys()) - if self.OUTPUT_TYPE == "loglikelihood": - results = results[0] - ll, is_greedy = results - return { - **({"perplexity": ll} if "perplexity" in use_metric else {}), - **({"acc": int(is_greedy)} if "acc" in use_metric else {}), - } - elif self.OUTPUT_TYPE == "loglikelihood_rolling": - (loglikelihood,) = results - _words = self.count_words(self.doc_to_target(doc)) - _bytes = self.count_bytes(self.doc_to_target(doc)) - return { - **( - {"word_perplexity": (loglikelihood, _words)} - if "word_perplexity" in use_metric - else {} - ), - **( - {"byte_perplexity": 
(loglikelihood, _bytes)} - if "byte_perplexity" in use_metric - else {} - ), - **( - {"bits_per_byte": (loglikelihood, _bytes)} - if "bits_per_byte" in use_metric - else {} - ), - } - elif self.OUTPUT_TYPE == "multiple_choice": - lls, is_greedy = zip(*results) - - # retrieve choices in List[str] form, to compute choice lengths, etc. - choices = self.doc_to_choice(doc) - completion_len = np.array([float(len(i)) for i in choices]) - - if ( - 2 * len(choices) == len(lls) - and "acc_mutual_info" in self._metric_fn_list.keys() - ): - # then we are doing mutual info. - # this stores the "dryrun" / unconditional answer loglikelihoods - # as we extend the args list with unconditional ("", continuation) pairs - lls_unconditional = lls[len(choices) :] - if len(lls_unconditional) != len(choices): - raise ValueError - # and this stores our "regular" conditional loglikelihoods - lls = lls[: len(choices)] - - pred = np.argmax(lls) - pred_norm = np.argmax(lls / completion_len) - - if self.multiple_input: - gold = self.doc_to_text(doc) - else: - gold = self.doc_to_target(doc) - - gold_index_error = False - if isinstance(gold, list): - gold = [i if i < len(choices) else -100 for i in gold] - if -100 in gold: - gold_index_error = True - else: - if isinstance(gold, int): - gold = gold if gold < len(choices) else -100 - elif isinstance(gold, str): - gold = choices.index(gold) if gold in choices else -100 - - if gold == -100: - gold_index_error = True - - if gold_index_error: - eval_logger.warning( - f"Label index was not in within range of available choices," - f"Sample:\n\n{doc}\n\n" - ) - - if self.multiple_target: - acc = 1.0 if pred in gold else 0.0 - acc_norm = 1.0 if pred_norm in gold else 0.0 - exact_match = int(any([is_greedy[i] if i != -100 else 0 for i in gold])) - else: - acc = 1.0 if pred == gold else 0.0 - acc_norm = 1.0 if pred_norm == gold else 0.0 - # TODO: this gets score of 0 on arc_challenge for pythia-70m. 
need to test that this works properly - exact_match = int(is_greedy[gold]) if gold != -100 else 0 - - prob_norm = utils.softmax(lls) - - # TODO use keyword arguments to the metric? - # gold, pred, norm stuff, the original lls, - result_dict = { - **({"acc": acc} if "acc" in use_metric else {}), - **({"f1": (gold, pred)} if "f1" in use_metric else {}), - **({"mcc": (gold, pred)} if "mcc" in use_metric else {}), - **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}), - **({"exact_match": exact_match} if "exact_match" in use_metric else {}), - **( - {"brier_score": (gold, prob_norm)} - if "brier_score" in use_metric - else {} - ), - } - - if "acc_mutual_info" in use_metric: - lls_mutual_info = [ - ll_c - ll_u for ll_c, ll_u in zip(lls, lls_unconditional) - ] - acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0 - result_dict["acc_mutual_info"] = acc_mutual_info - - elif self.OUTPUT_TYPE == "generate_until": - gold = self.doc_to_target(doc) - result = results[0] - if self.config.doc_to_choice is not None: - # If you set doc_to_choice, - # it assumes that doc_to_target returns a number. - choices = self.doc_to_choice(doc) - gold = choices[gold] - # we expect multiple_targets to be a list. 
- elif self.multiple_target: - gold = list(gold) - # TODO: handle this better - elif type(gold) is not type(result) and not ( - "bypass" in self._metric_fn_list.keys() or isinstance(result, list) - ): - # cast gold to the same type as result - gold = type(result)(gold) - - for metric in self._metric_fn_list.keys(): - if self.multiple_target: - # in the case where we have multiple targets, - # return true if any are true - # TODO: this may break for multipLe_target, non zero-or-1 metrics - scores = [] - if not isinstance(gold, list): - # sometimes, a multiple_target dataset has exceptions where one doc has only one string answer - # print(gold) - gold = [gold] - if metric == "exact_match": - result = [result for _ in range(len(gold))] - scores = self._metric_fn_list[metric]( - references=gold, - predictions=result, - **self._metric_fn_kwargs[metric], - )[metric] - result_score = 1.0 if scores > 0.0 else 0.0 - else: - for gold_option in gold: - try: - result_score = self._metric_fn_list[metric]( - references=[gold_option], - predictions=[result], - **self._metric_fn_kwargs[metric], - ) - except ( - TypeError - ): # TODO: this is hacky and I don't want to do it - result_score = self._metric_fn_list[metric]( - [gold_option, result] - ) - if isinstance(result_score, dict): - # TODO: this handles the case where HF evaluate returns a dict. - result_score = result_score[metric] - scores.append(result_score) - if any(scores): - result_score = 1.0 - else: - result_score = 0.0 - else: - try: - result_score = self._metric_fn_list[metric]( - references=[gold], - predictions=[result], - **self._metric_fn_kwargs[metric], - ) - except TypeError: # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics - result_score = self._metric_fn_list[metric]([gold, result]) - if isinstance(result_score, dict): - # TODO: this handles the case where HF evaluate returns a dict. 
- # This allows for multiple metrics to be returned from the same function - for k, v in result_score.items(): - result_dict[k] = v - else: - result_dict[metric] = result_score - else: - raise ValueError( - f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ", - "'loglikelihood', 'loglikelihood_rolling', 'generate_until' or 'multiple_choice'", - ) - - return result_dict - - def aggregation(self) -> dict: - return self._aggregation_list - - def higher_is_better(self) -> dict: - return self._higher_is_better - - def get_config(self, key: str) -> Any: - return getattr(self._config, key, None) - - @property - def task_name(self) -> Any: - return getattr(self.config, "task", None) - - def __repr__(self): - return ( - f"ConfigurableTask(task_name={getattr(self.config, 'task', None)}," - f"output_type={self.OUTPUT_TYPE}," - f"num_fewshot={getattr(self.config, 'num_fewshot', None)}," - f"num_samples={len(self.eval_docs)})" - ) - - -class MultipleChoiceTask(Task): - OUTPUT_TYPE = "loglikelihood" - - def doc_to_target(self, doc: dict) -> str: - return " " + doc["choices"][doc["gold"]] - - def construct_requests(self, doc: dict, ctx: str, **kwargs) -> List[Instance]: - # TODO: add mutual info here? - return [ - Instance( - request_type="loglikelihood", - doc=doc, - arguments=(ctx, " {}".format(choice)), - idx=i, - **kwargs, - ) - for i, choice in enumerate(doc["choices"]) - ] - - def process_results(self, doc: dict, results: Iterable[Tuple[float, bool]]) -> dict: - results = [ - res[0] for res in results - ] # only retain loglikelihoods, discard is_greedy TODO: do we need is_greedy anywhere? 
- gold = doc["gold"] - - acc = 1.0 if np.argmax(results) == gold else 0.0 - completion_len = np.array([float(len(i)) for i in doc["choices"]]) - acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0 - - return { - "acc": acc, - "acc_norm": acc_norm, - } - - def higher_is_better(self) -> dict: - return { - "acc": True, - "acc_norm": True, - } - - def aggregation(self) -> dict: - return { - "acc": mean, - "acc_norm": mean, - } - - -class PerplexityTask(Task): - OUTPUT_TYPE = "loglikelihood_rolling" - - def has_training_docs(self) -> bool: - return False - - def fewshot_examples(self, k: int, rnd) -> List: - if k != 0: - raise ValueError( - "The number of fewshot examples must be 0 for perplexity tasks." - ) - return [] - - def fewshot_context(self, doc: dict, num_fewshot: int) -> Literal[""]: - if num_fewshot != 0: - raise ValueError( - "The number of fewshot examples must be 0 for perplexity tasks." - ) - - return "" - - def higher_is_better(self) -> dict: - return { - "word_perplexity": False, - "byte_perplexity": False, - "bits_per_byte": False, - } - - def doc_to_decontamination_query(self, doc): - return doc - - def doc_to_text(self, doc) -> str: - return "" - - def doc_to_target(self, doc): - return doc - - def construct_requests(self, doc: dict, ctx: Optional[str], **kwargs): - if bool(ctx): - raise ValueError - - return Instance( - request_type=self.OUTPUT_TYPE, - doc=doc, - arguments=(self.doc_to_target(doc),), - idx=0, - **kwargs, - ) - - def process_results(self, doc: dict, results: Tuple[float]) -> dict: - (loglikelihood,) = results - words = self.count_words(self.doc_to_target(doc)) - bytes_ = self.count_bytes(self.doc_to_target(doc)) - return { - "word_perplexity": (loglikelihood, words), - "byte_perplexity": (loglikelihood, bytes_), - "bits_per_byte": (loglikelihood, bytes_), - } - - def aggregation(self) -> dict: - return { - "word_perplexity": weighted_perplexity, - "byte_perplexity": weighted_perplexity, - "bits_per_byte": 
bits_per_byte, - } - - @classmethod - def count_bytes(cls, doc) -> int: - return len(doc.encode("utf-8")) - - @classmethod - def count_words(cls, doc) -> int: - """Downstream tasks with custom word boundaries should override this!""" - return len(re.split(r"\s+", doc)) diff --git a/lm-evaluation-harness/lm_eval/caching/cache.py b/lm-evaluation-harness/lm_eval/caching/cache.py deleted file mode 100644 index f8d293b0ff8b1ebac186f5ac078cdb49227562db..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/caching/cache.py +++ /dev/null @@ -1,59 +0,0 @@ -import hashlib -import logging -import os - -import dill - - -eval_logger = logging.getLogger(__name__) - - -MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) - -OVERRIDE_PATH = os.getenv("LM_HARNESS_CACHE_PATH") - - -PATH = OVERRIDE_PATH if OVERRIDE_PATH else f"{MODULE_DIR}/.cache" - -# This should be sufficient for uniqueness -HASH_INPUT = "EleutherAI-lm-evaluation-harness" - -HASH_PREFIX = hashlib.sha256(HASH_INPUT.encode("utf-8")).hexdigest() - -FILE_SUFFIX = f".{HASH_PREFIX}.pickle" - - -def load_from_cache(file_name: str, cache: bool = False): - if not cache: - return - try: - path = f"{PATH}/{file_name}{FILE_SUFFIX}" - - with open(path, "rb") as file: - cached_task_dict = dill.loads(file.read()) - return cached_task_dict - - except Exception: - eval_logger.debug(f"{file_name} is not cached, generating...") - pass - - -def save_to_cache(file_name, obj): - if not os.path.exists(PATH): - os.mkdir(PATH) - - file_path = f"{PATH}/{file_name}{FILE_SUFFIX}" - - eval_logger.debug(f"Saving {file_path} to cache...") - with open(file_path, "wb") as file: - file.write(dill.dumps(obj)) - - -# NOTE the "key" param is to allow for flexibility -def delete_cache(key: str = ""): - files = os.listdir(PATH) - - for file in files: - if file.startswith(key) and file.endswith(FILE_SUFFIX): - file_path = f"{PATH}/{file}" - os.unlink(file_path) diff --git 
a/lm-evaluation-harness/lm_eval/decontamination/__init__.py b/lm-evaluation-harness/lm_eval/decontamination/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/lm-evaluation-harness/lm_eval/decontamination/archiver.py b/lm-evaluation-harness/lm_eval/decontamination/archiver.py deleted file mode 100644 index c132232116c2ae5f5ab1dc3a2a0afc0dbd4ef1bd..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/decontamination/archiver.py +++ /dev/null @@ -1,174 +0,0 @@ -import datetime -import io -import json -import mmap -import os -from pathlib import Path -from typing import Any - -import jsonlines -import tqdm -import zstandard - - -def json_serial(obj: Any) -> str: - """JSON serializer for objects not serializable by default json code""" - - if isinstance(obj, (datetime.datetime,)): - return obj.isoformat() - raise TypeError("Type %s not serializable" % type(obj)) - - -# Modified version of lm_dataformat Archive for single file. -class Archive: - def __init__(self, file_path: str, compression_level: int = 3) -> None: - self.file_path = file_path - dir_name = os.path.dirname(file_path) - if dir_name: - os.makedirs(dir_name, exist_ok=True) - self.fh = open(self.file_path, "wb") - self.cctx = zstandard.ZstdCompressor(level=compression_level) - self.compressor = self.cctx.stream_writer(self.fh) - - def add_data(self, data, meta=None) -> None: - if meta is None: - meta = {} - self.compressor.write( - json.dumps({"text": data, "meta": meta}, default=json_serial).encode( - "UTF-8" - ) - + b"\n" - ) - - def commit(self) -> None: - self.compressor.flush(zstandard.FLUSH_FRAME) - self.fh.flush() - self.fh.close() - - -# Modified version of lm_dataformat Reader with self.fh set, allowing peeking for tqdm. 
-class Reader: - def __init__(self) -> None: - pass - - def read( - self, - file, - get_meta: bool = False, - autojoin_paragraphs: bool = True, - para_joiner: str = "\n\n", - ): - with open(file, "rb") as fh: - self.fh = fh - cctx = zstandard.ZstdDecompressor() - reader = io.BufferedReader(cctx.stream_reader(fh)) - rdr = jsonlines.Reader(reader) - for ob in rdr: - # naive jsonl where each object is just the string itself, with no meta. For legacy compatibility. - if isinstance(ob, str): - assert not get_meta - yield ob - continue - - text = ob["text"] - - if autojoin_paragraphs and isinstance(text, list): - text = para_joiner.join(text) - - if get_meta: - yield text, (ob["meta"] if "meta" in ob else {}) - else: - yield text - - -class TextArchive: - def __init__(self, file_path, mode: str = "rb+") -> None: - self.file_path = file_path - dir_name = os.path.dirname(file_path) - if dir_name: - os.makedirs(dir_name, exist_ok=True) - - if not os.path.exists(file_path): - Path(file_path).touch() - - self.fh = open(self.file_path, mode) - - def add_data(self, data) -> None: - self.fh.write(data.encode("UTF-8") + b"\n") - - def commit(self) -> None: - self.fh.flush() - self.fh.close() - - -class TextReader: - def __init__(self, file_path) -> None: - self.file_path = file_path - - # Optimized mmap read with infrequent tqdm updates to maintain speed - # Tested up to 250MB/s. 
- def read_tqdm(self, update_frequency: int = 10000): - current_file_position = 0 - line_counter = 0 - with ( - open(self.file_path, "r", encoding="utf-8") as fh, - tqdm.tqdm( - total=os.path.getsize(self.file_path), - dynamic_ncols=True, - unit="byte", - unit_scale=1, - ) as progress, - ): - with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj: - for line in iter(mmap_obj.readline, b""): - line = line.decode("utf-8") - line_counter += 1 - if line_counter == update_frequency: - new_file_pos = mmap_obj.tell() - bytes_read = new_file_pos - current_file_position - current_file_position = new_file_pos - progress.update(bytes_read) - line_counter = 0 - yield line[:-1] - - def read_and_tell(self): - current_file_position = 0 - with open(self.file_path, "r", encoding="utf8") as fh: - with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj: - for line in iter(mmap_obj.readline, b""): - line = line.decode("utf-8") - new_file_pos = mmap_obj.tell() - raw_bytes_read = new_file_pos - current_file_position - current_file_position = new_file_pos - yield line[:-1], raw_bytes_read - - def read(self): - with open(self.file_path, "r", encoding="utf8") as fh: - with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj: - for line in iter(mmap_obj.readline, b""): - line = line.decode("utf-8") - yield line[:-1] - - def read_slow(self): - with open(self.file_path, "r", encoding="utf8") as fh: - while True: - line = fh.readline() - if line == -1 or line == "": - break - else: - yield line[:-1] - - -# Optimized for speed. Decompresses the archive in shell before -# using the mmap'd TextReader. 
-class ZStdTextReader: - def __init__(self, file) -> None: - self.file = file - - def read_tqdm(self): - decompressed_file = self.file[:-4] - print("Decompressing file, please wait...") - os.system(f"zstd -d {self.file}") # linux decompress is faster - reader = TextReader(decompressed_file) - yield from reader.read_tqdm() - os.remove(decompressed_file) diff --git a/lm-evaluation-harness/lm_eval/decontamination/decontaminate.py b/lm-evaluation-harness/lm_eval/decontamination/decontaminate.py deleted file mode 100644 index 2d1250d39bf7cd0272e412452d970ec7c52992c5..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/decontamination/decontaminate.py +++ /dev/null @@ -1,166 +0,0 @@ -import collections -import glob -import json -import os -import pickle -import random -import time - -from .archiver import ZStdTextReader -from .janitor import Janitor, word_ngrams - - -# Was used for testing the evaluator decoupled from the full logic below -def get_train_overlap_stub(docs: dict, ngrams_path: str, ngrams_n_size: str): - simulated_overlap = 0.1 - contaminated = int(len(docs) * simulated_overlap) - return random.sample(range(len(docs)), contaminated) - - -# Returns a dictionary containing all overlapping documents in each -# task. In the standard use case, an overlap occurs when any of the 13-grams -# found in the task document exist in the training set documents. -# -# To generate 13-grams for the pile see scripts/clean_training_data. The final output of these -# scripts are an info.json file containing the n_gram_size (13) and a bunch of "ngrams_{x}.bkt.txt.sorted.zst" -# files. These should exist in the "ngrams_path" provided to this function. - - -# Algorithm: -# 1. Build lookups for each dataset {ngram: list(document_ids)} -# 2. Merge into an overall lookup {ngram: [(task_name, task_set, doc_ids),]} -# 3. 
Full scan the 13-grams from the training set against the merged lookup, -# saving matches in the "duplicates" dictionary {(task_name, task_set): set(doc_ids)} -# 4. Strip the task_set from the dictionary keys and return -# -# We cache the task+set lookups as well as the overlaps. -def get_train_overlap(docs_by_task_set: dict, ngrams_path: str, limit: int) -> dict: - # return get_train_overlap_stub(docs, ngrams_path, ngrams_n_size) - - info_dict_path = os.path.join(ngrams_path, "info.json") - info_dict = json.load(open(info_dict_path, "r", encoding="utf-8")) - ngrams_n_size = info_dict["ngram_size"] - - janitor = Janitor() - - # Build lookup for each dataset first in case we use different task combinations later - print("Building Lookups...") - start = time.perf_counter() - - def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit) -> str: - return f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.overlaps" - - lookups = {} - duplicates = {} # (task_name, task_set): set(doc_ids)} - sets_to_decontaminate = len(docs_by_task_set.keys()) - - for (task_name, task_set), docs in docs_by_task_set.items(): - if not os.path.exists(f"data/{task_name}"): - os.mkdir(f"data/{task_name}") - - # Check if we've decontaminated this combination before - overlaps_dump_path = get_overlaps_dump_path( - task_name, task_set, ngrams_n_size, limit - ) - if os.path.exists(overlaps_dump_path): - duplicates[(task_name, task_set)] = pickle.load( - open(overlaps_dump_path, "rb") - ) - sets_to_decontaminate -= 1 - continue - else: - duplicates[(task_name, task_set)] = set() - - # Build/load the task lookup {ngram: set(documents)}. 
- task_set_lookup_path = ( - f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.lookup" - ) - if os.path.exists(task_set_lookup_path): - print(f"{task_set_lookup_path} available, loading...") - lookups[(task_name, task_set)] = pickle.load( - open(task_set_lookup_path, "rb") - ) - else: - print(f"{task_set_lookup_path} not available, building...") - lookup = collections.defaultdict(set) - - for doc_id, document in enumerate(docs): - ngrams = word_ngrams(janitor.normalize_string(document), ngrams_n_size) - for ngram in ngrams: - lookup[ngram].add(doc_id) - - pickle.dump(lookup, open(task_set_lookup_path, "wb")) - lookups[(task_name, task_set)] = lookup - - elapsed = time.perf_counter() - start - print(f"Building lookups took {elapsed:0.5f} seconds.") - - matched_ngrams = [] - - if sets_to_decontaminate > 0: - print("Merging lookups...") - start = time.perf_counter() - merged_lookup = collections.defaultdict(list) - for (task_name, task_set), lookup in lookups.items(): - for ngram, doc_ids in lookup.items(): - merged_lookup[ngram].append((task_name, task_set, doc_ids)) - - elapsed = time.perf_counter() - start - print(f"Merging lookups took {elapsed:0.5f} seconds.") - - print(f"{ngrams_n_size} grams files found in {ngrams_path}:") - files = glob.glob(os.path.join(ngrams_path, "*.sorted.zst")) - print(files) - - for file in files: - start = time.perf_counter() - print(f"Scanning {file}") - reader = ZStdTextReader(file) - total_ngrams = 0 - unique_ngrams = 0 - matching_unique = 0 - non_matching_unique = 0 - - current_ngram = "" - for line in reader.read_tqdm(): # Scan training set ngrams file - total_ngrams += 1 - [ngram, document_id] = line.rsplit(" ", 1) - if ( - ngram != current_ngram - ): # Only need to match the ngram once in training set - unique_ngrams += 1 - current_ngram = ngram - if ngram in merged_lookup: - matched_ngrams.append(ngram) # For logging - matching_unique += 1 - for task_name, task_set, doc_ids in merged_lookup[ngram]: - task_doc_set 
= duplicates[(task_name, task_set)] - for doc_id in doc_ids: # Record contamination across all relevant task/set combos - task_doc_set.add(doc_id) - del merged_lookup[ngram] # No point matching again - else: - non_matching_unique += 1 - - print(f"Total Ngrams: {total_ngrams}") - print(f"Unique Ngrams: {unique_ngrams}") - print(f"Unique Matching: {matching_unique}") - print(f"Unique Non Matching: {non_matching_unique}") - print("Matched ngrams:") - for ngram in matched_ngrams: - print(ngram) - - elapsed = time.perf_counter() - start - print(f"Read took {elapsed:0.5f} seconds.") - print(f"Speed: {(os.path.getsize(file) / 1000000.0) / elapsed}MB/second") - - print(duplicates) - - # Dump overlaps separately - for (task_name, task_set), doc_ids in duplicates.items(): - overlaps_dump_path = get_overlaps_dump_path( - task_name, task_set, ngrams_n_size, limit - ) - pickle.dump(doc_ids, open(overlaps_dump_path, "wb")) - - # Strip task set and return - return {task_name: doc_ids for (task_name, task_set), doc_ids in duplicates.items()} diff --git a/lm-evaluation-harness/lm_eval/decontamination/janitor.py b/lm-evaluation-harness/lm_eval/decontamination/janitor.py deleted file mode 100644 index cedf8a5717aa8156674836ba236fdcabf36e0487..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/decontamination/janitor.py +++ /dev/null @@ -1,328 +0,0 @@ -import pickle -import re -import string -import traceback -from typing import Iterator, List, Sequence, Tuple, TypeVar - - -# This is a cpp module. Compile janitor_util.cpp with: -# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup -try: - import janitor_util - - JANITOR_CPP = True -except Exception: - print("WARNING: C++ module could not be loaded. 
Janitor running in python mode") - traceback.print_exc() - JANITOR_CPP = False - -T = TypeVar("T") - - -# Implementation from nltk source -# https://www.nltk.org/_modules/nltk/util.html -def form_ngrams(sequence: Iterator[T], n: int) -> Iterator[Tuple[T, ...]]: - history = [] - while n > 1: - # PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator - try: - next_item = next(sequence) - except StopIteration: - # no more data, terminate the generator - return - history.append(next_item) - n -= 1 - for item in sequence: - history.append(item) - yield tuple(history) - del history[0] - - -def word_ngrams(s: str, n: int) -> Iterator[str]: - """Splits a string into ngram words""" - tokens = s.split() # not a generator :( - ngram_seqs = form_ngrams(iter(tokens), n) - return (" ".join(ngram) for ngram in ngram_seqs) - - -# Does character sequences only - combined faster function to play around with later -# def word_ngrams_indices_combined(sequence, n): -# current_word = "" -# history = [] -# gap = False; -# start = 0 -# end = 0 -# for character in sequence: -# if character == " ": -# if not gap: -# gap = True -# history.append(current_word) -# end += len(current_word) - 1 -# current_word = "" -# if len(history) == n: -# yield (tuple(history), start, end) -# del history[0] -# start = end + 1 -# end = start -# else: -# gap = False -# current_word += character - - -# https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python -def split_indices(s: str) -> Iterator[Tuple[str, Tuple[int, int]]]: - """Splits a string on whitespaces and records the indices of each in the original string. - @:return generator((word, (start_idx, end_idx)), ...) 
- """ - return ((m.group(0), (m.start(), m.end() - 1)) for m in re.finditer(r"\S+", s)) - - -def word_ngrams_indices(s: str, n: int) -> Iterator[Tuple[str, Tuple[int, int]]]: - """Splits a string into pairs of (ngram words, their start/end indices)""" - tokens_with_indices = split_indices(s) - - # Generator of ngrams of (word, idx_pairs) - # ( - # [(word, (start,end)), (word, (start, end))...], - # [(word, (start, end)), ...], - # ... - # ) - ngram_seqs_with_indices = form_ngrams(tokens_with_indices, n) - - # Generator of pairs of word and index ngrams - # ( - # ([word, word, ...], [(start,end), (start,end), ...]), - # ... - # ) - ngram_indices_pairs = ( - zip(*ngram_with_indices) for ngram_with_indices in ngram_seqs_with_indices - ) - - # Generator of ( (word_ngram, (start, end)), (word_ngram, start, end)), ...) - return ( - (" ".join(ngram_seq), (indices[0][0], indices[-1][1])) - for ngram_seq, indices in ngram_indices_pairs - ) - - -class Janitor: - # FIXME delete_chars: Should anything else go here? Special chars? - def __init__( - self, - ngram_n: int = 13, - window_to_remove: int = 200, - too_dirty_cutoff: int = 10, - minimum_slice_length: int = 200, - delete_chars: str = string.punctuation, - ) -> None: - self.ngram_n = ngram_n - self.window_to_remove = window_to_remove - self.too_dirty_cutoff = too_dirty_cutoff - self.minimum_slice_length = minimum_slice_length - self.delete_chars = delete_chars - - self.dirt_ngrams = set() - - # If in python, we'll translate uppercase to lowercase and delete naughty characters. 
- # This is fast by python standards - # https://stackoverflow.com/questions/638893/what-is-the-most-efficient-way-in-python-to-convert-a-string-to-all-lowercase-st - self.translation_table = str.maketrans( - string.ascii_lowercase + string.ascii_uppercase, # These characters - string.ascii_lowercase * 2, # Become these characters - self.delete_chars, # These are deleted - ) - - ############## - # I/O for saving contamination ngrams - ############## - - def save_contamination_ngrams(self, filename: str) -> None: - with open(filename, "wb") as fp: - pickle.dump(filename, fp) - - def load_contamination_ngrams(self, filename: str) -> None: - with open(filename, "rb") as fp: - self.dirt_ngrams = pickle.load(fp) - - ############## - # Call these :) - ############## - - def register_contaminant(self, dirt_string: str) -> None: - """Register a string as contamination to be removed, e.g. a test set - This breaks the dirt_string into ngrams to store for future cleaning""" - if JANITOR_CPP: - return self.register_contaminant_cpp(dirt_string) - else: - print("WARNING: Janitor running in python mode") - return self.register_contaminant_python(dirt_string) - - def clean(self, dirty_string: str) -> List[str]: - """Clean a string (e.g. a training set) by removing all ngrams previously - registered as contaminants. 
Returns a list of clean chunks, or empty if - the string was too dirty""" - if JANITOR_CPP: - return self.clean_cpp(dirty_string) - else: - print("WARNING: Janitor running in python mode") - return self.clean_python(dirty_string) - - def _split_chunks( - self, dirty_string: str, dirty_parts: Sequence[Tuple] - ) -> List[str]: - clean_chunks = [] - splice_idx = 0 - end = -1 - for i, (ngram, start, end) in enumerate(dirty_parts): - if i >= self.too_dirty_cutoff: - return [] - start = max(0, start - self.window_to_remove) - end = min(len(dirty_string), end + self.window_to_remove) - - if start - splice_idx > self.minimum_slice_length: - clean_chunks.append(dirty_string[splice_idx:start]) - splice_idx = end - - if end < len(dirty_string) - self.minimum_slice_length: - clean_chunks.append(dirty_string[end + 1 :]) - - return clean_chunks - - ############## - # Fast C++ - ############## - - def register_contaminant_cpp(self, dirt_string) -> None: - self.dirt_ngrams.update( - janitor_util.clean_ngram(dirt_string, self.delete_chars, self.ngram_n) - ) - - def clean_cpp(self, dirty_string: str) -> List[str]: - contamination_indices = janitor_util.clean_ngram_with_indices( - dirty_string, self.delete_chars, self.ngram_n - ) - return self._split_chunks(dirty_string, contamination_indices) - - ############## - # Slow python - ############## - - def normalize_string(self, s: str) -> str: - return s.translate(self.translation_table) - - def register_contaminant_python(self, dirt_string: str) -> None: - self.dirt_ngrams.update( - word_ngrams(self.normalize_string(dirt_string), self.ngram_n) - ) - - def clean_python(self, dirty_string: str) -> List[str]: - contamination_indices = ( - (None, *idx_pair) - for dirty_ngram, idx_pair in word_ngrams_indices(dirty_string, self.ngram_n) - if self.normalize_string(dirty_ngram) in self.dirt_ngrams - ) - return self._split_chunks(dirty_string, contamination_indices) - - -################################################################## -# 
Tests -################################################################# - -# def print_cpp(): -# source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2 - -# for i in range(1, 10, 2): -# pprint(janitor_util.clean_ngram(source, string.punctuation, i)) -# for ngram, start, end in \ -# janitor_util.clean_ngram_with_indices(source, string.punctuation, i): -# print(ngram, "\t", start, end, source[start:end].replace("\n", "\\n")) - - -# def test_cpp(): -# source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2 -# contaminant = "dirty boy. Clean he he" - -# jan_python = Janitor() -# jan_cpp = Janitor() - -# jan_python.register_contaminant_python(contaminant) -# jan_cpp.register_contaminant(contaminant) - -# assert jan_python.dirt_ngrams == jan_cpp.dirt_ngrams, (jan_python.dirt_ngrams, jan_cpp.dirt_ngrams) - -# assert jan_python.clean_python(source) == jan_cpp.clean(source), \ -# (jan_python.clean_python(source), jan_cpp.clean(source)) - -# print("Passed test, python==cpp") - - -# def benchmark(): -# # Download and put in data folder: enwik8 (100 MB) from https://cs.fit.edu/~mmahoney/compression/textdata.html -# setup = \ -# """ -# with open("data/enwik8", "r") as f: -# data = f.read() -# jan = Janitor(too_dirty_cutoff=1000) -# jan.register_contaminant(''' -# theories is that there is a connection between "geekdom" and autism. -# This is hinted, for instance, by a ''Wired Magazine'' article in 2001 entitled " -# The [[Geek]] Syndrome", which is a point argued by many in the autism rights -# movement{{ref|Wired}}. 
This article, many professionals assert, is just one example of -# the media's application of mental disease labels to what is actually variant normal behavior -# &mdash;they argue that shyness, lack of athletic ability or social skills, and intellectual -# interests, even when they seem unusual to others, are not in themselves signs of autism or -# Asperger's syndrome. Others assert that it is actually the medical profession which is applying -# mental disease labels to children who in the past would have simply been accepted as a little -# different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue. -# Due to the recent publicity surrounding autism and autis -# ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first, -# oil money had a marginal impact. A few lowrise concete buildings were erected, and the first -# paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties -# would last, took a cautious approach, preferring to save the revenue rather than investing it in -# development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential -# to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his -# brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]], -# with the assistance of the British, Sheikh Zayed became the new ruler. See generally, Al-Fahim, M, -# ''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995), -# ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the -# Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the -# [[United Arab Emirates]]. 
After the Emirates gained independence in 1971, -# ''') -# """ - -# n = 1 -# print(f"Timing {n} run on 100 MB") -# print("Register contaminant") -# # print("\tPython", timeit.timeit("jan.register_contaminant_python(data)", setup=setup, globals=globals(), number=n)) -# print("\tCpp", timeit.timeit("jan.register_contaminant(data)", setup=setup, globals=globals(), number=n)) - -# print("Clean") -# # print("\tPython", timeit.timeit("jan.clean_python(data)", setup=setup, globals=globals(), number=n)) -# print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n)) - - -# def test_janitor_general(): -# source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2 -# contaminant = "dirty boy. Clean he he" - -# jan = Janitor(ngram_n=3) -# jan.register_contaminant(contaminant) -# cleaned = " ".join(jan.clean(source)) -# for contam in jan.dirt_ngrams: -# assert contam not in cleaned, contam - -# filename = "data/saved_contam" -# jan.save_contamination_ngrams(filename) - -# jan = Janitor(ngram_n=3) -# jan.load_contamination_ngrams(filename) -# cleaned = " ".join(jan.clean(source)) -# for contam in jan.dirt_ngrams: -# assert contam not in cleaned, contam - - -# if __name__ == "__main__": -# test() -# # print_cpp() -# # test_cpp() -# # benchmark() diff --git a/lm-evaluation-harness/lm_eval/evaluator.py b/lm-evaluation-harness/lm_eval/evaluator.py deleted file mode 100644 index 4da14c47ce3fc15c52c435c31fa9090ed0c269e9..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/evaluator.py +++ /dev/null @@ -1,761 +0,0 @@ -import itertools -import json -import logging -import random -import time -from collections import defaultdict -from typing import TYPE_CHECKING, List, Optional, Union - -import numpy as np -import torch - -import lm_eval.api.metrics -import lm_eval.api.registry -import lm_eval.api.task -import lm_eval.models -from lm_eval.caching.cache import delete_cache -from 
lm_eval.evaluator_utils import ( - consolidate_group_results, - consolidate_results, - get_sample_size, - get_subtask_list, - get_task_list, - prepare_print_tasks, - print_writeout, - run_task_tests, -) -from lm_eval.loggers import EvaluationTracker -from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash -from lm_eval.tasks import TaskManager, get_task_dict -from lm_eval.utils import ( - handle_non_serializable, - hash_string, - positional_deprecated, - setup_logging, - simple_parse_args_string, -) - - -if TYPE_CHECKING: - from lm_eval.api.model import LM - from lm_eval.api.task import Task - -eval_logger = logging.getLogger(__name__) - - -@positional_deprecated -def simple_evaluate( - model, - model_args: Optional[Union[str, dict]] = None, - tasks: Optional[List[Union[str, dict, object]]] = None, - num_fewshot: Optional[int] = None, - batch_size: Optional[Union[int, str]] = None, - max_batch_size: Optional[int] = None, - device: Optional[str] = None, - use_cache: Optional[str] = None, - cache_requests: bool = False, - rewrite_requests_cache: bool = False, - delete_requests_cache: bool = False, - limit: Optional[Union[int, float]] = None, - samples: Optional[dict] = None, - bootstrap_iters: int = 100000, - check_integrity: bool = False, - write_out: bool = False, - log_samples: bool = True, - evaluation_tracker: Optional[EvaluationTracker] = None, - system_instruction: Optional[str] = None, - apply_chat_template: Union[bool, str] = False, - fewshot_as_multiturn: bool = False, - gen_kwargs: Union[str, dict, None] = None, - task_manager: Optional[TaskManager] = None, - verbosity=None, - predict_only: bool = False, - random_seed: int = 0, - numpy_random_seed: int = 1234, - torch_random_seed: int = 1234, - fewshot_random_seed: int = 1234, - confirm_run_unsafe_code: bool = False, - metadata: Optional[dict] = None, -): - """Instantiate and evaluate a model on a list of tasks. 
- - :param model: Union[str, LM] - Name of model or LM object, see lm_eval.models.get_model - :param model_args: Optional[str, dict] - String or dict arguments for each model class, see LM.create_from_arg_string and LM.create_from_arg_object. - Ignored if `model` argument is a LM object. - :param tasks: list[Union[str, dict, Task]] - List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise. - :param num_fewshot: int - Number of examples in few-shot context - :param batch_size: int or str, optional - Batch size for model - :param max_batch_size: int, optional - Maximal batch size to try with automatic batch size detection - :param device: str, optional - PyTorch device (e.g. "cpu" or "cuda:0") for running models - :param use_cache: str, optional - A path to a sqlite db file for caching model responses. `None` if not caching. - :param cache_requests: bool, optional - Speed up evaluation by caching the building of dataset requests. `None` if not caching. - :param rewrite_requests_cache: bool, optional - Rewrites all the request cache if set to `True`. `None` if not desired. - :param delete_requests_cache: bool, optional - Deletes all the request cache if set to `True`. `None` if not desired. - :param limit: int or float, optional - Limit the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples. - :param samples: dictionary, optional - Dictionary indicating which examples should be tested in each task, e.g., {"mmlu_astronomy":[0,3,6],"mmlu_anatomy":[1,4,7,10]}. - :param bootstrap_iters: - Number of iterations for bootstrap statistics, used when calculating stderrs. set to 0 for no stderr calculations to be performed. 
- :param check_integrity: bool - Whether to run the relevant part of the test suite for the tasks - :param write_out: bool - If True, write out an example document and model input for checking task integrity - :param log_samples: bool - If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis - :param system_instruction: str - System instruction to be applied to the prompt - :param apply_chat_template: Union[bool, str] - Specifies whether to apply a chat template to the prompt. - - If set to True, the default chat template is applied. - - If set to a string, applies the specified chat template by name. - Defaults to False (no chat template applied). - :param fewshot_as_multiturn: bool - Whether to provide the fewshot examples as a multiturn conversation or a single user turn. - :param gen_kwargs: dict or comma-separated string - Arguments for model generation - Ignored for all tasks with loglikelihood output_type - :param verbosity: str - Verbosity level for logging - :param predict_only: bool - If true only model outputs will be generated and returned. Metrics will not be evaluated - :param random_seed: int - Random seed for python's random module. If set to None, the seed will not be set. - :param numpy_random_seed: int - Random seed for numpy. If set to None, the seed will not be set. - :param torch_random_seed: int - Random seed for torch. If set to None, the seed will not be set. - :param fewshot_random_seed: int - Random seed for fewshot sampler random generator. If set to None, the seed of generator will be set to None. - :param metadata: dict - Additional metadata to be added to the task manager. Will get passed to the download function of the task. 
- - return - Dictionary of results - """ - if verbosity is not None: - setup_logging(verbosity=verbosity) - start_date = time.time() - - if limit is not None and samples is not None: - raise ValueError( - "Either 'limit' or 'samples' must be None, but both are not None." - ) - - if isinstance(model_args, str) and ( - "instruct" in model_args and not apply_chat_template - ): - eval_logger.warning( - "Instruct model detected, but chat template not applied. Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`)." - ) - - if delete_requests_cache: - eval_logger.info("Deleting requests cache...") - delete_cache() - - seed_message = [] - if random_seed is not None: - # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1412 - seed_message.append(f"Setting random seed to {random_seed}") - random.seed(random_seed) - - if numpy_random_seed is not None: - seed_message.append(f"Setting numpy seed to {numpy_random_seed}") - np.random.seed(numpy_random_seed) - - if torch_random_seed is not None: - seed_message.append(f"Setting torch manual seed to {torch_random_seed}") - torch.manual_seed(torch_random_seed) - - if fewshot_random_seed is not None: - seed_message.append(f"Setting fewshot manual seed to {fewshot_random_seed}") - - if seed_message: - eval_logger.info(" | ".join(seed_message)) - - if tasks is None: - tasks = [] - if len(tasks) == 0: - raise ValueError( - "No tasks specified, or no tasks found. Please verify the task names." - ) - - if gen_kwargs is not None: - if isinstance(gen_kwargs, str): - gen_kwargs = simple_parse_args_string(gen_kwargs) - eval_logger.warning( - f"generation_kwargs: {gen_kwargs} specified through cli, these settings will update set parameters in yaml tasks. " - "Ensure 'do_sample=True' for non-greedy decoding!" - ) - if not gen_kwargs: - gen_kwargs = None - - if isinstance(model, str): - if model_args is None: - eval_logger.warning("model_args not specified. 
Using defaults.") - model_args = "" - - if isinstance(model_args, dict): - eval_logger.info( - f"Initializing {model} model, with arguments: {model_args}" - ) - lm = lm_eval.api.registry.get_model(model).create_from_arg_obj( - model_args, - { - "batch_size": batch_size, - "max_batch_size": max_batch_size, - "device": device, - }, - ) - - else: - eval_logger.info( - f"Initializing {model} model, with arguments: {simple_parse_args_string(model_args)}" - ) - lm = lm_eval.api.registry.get_model(model).create_from_arg_string( - model_args, - { - "batch_size": batch_size, - "max_batch_size": max_batch_size, - "device": device, - }, - ) - else: - if not isinstance(model, lm_eval.api.model.LM): - raise TypeError( - f"The value of `model` passed to simple_evaluate() was of type {type(model)}, but is required to be a subclass of lm_eval.api.model.LM . This may be because you are passing an initialized Hugging Face PreTrainedModel without having wrapped it in `lm_eval.models.huggingface.HFLM(pretrained=my_model)` first." - ) - eval_logger.info("Using pre-initialized model") - lm = model - - if use_cache is not None: - eval_logger.info(f"Using cache at {use_cache + '_rank' + str(lm.rank) + '.db'}") - lm = lm_eval.api.model.CachingLM( - lm, - use_cache - # each rank receives a different cache db. - # necessary to avoid multiple writes to cache at once - + "_rank" - + str(lm.rank) - + ".db", - ) - - if task_manager is None: - metadata = ( - simple_parse_args_string(model_args) - if isinstance(model_args, str) - else model_args - if isinstance(model_args, dict) - else {} - ) | (metadata or {}) - task_manager = TaskManager(metadata=metadata) - - task_dict = get_task_dict( - tasks, - task_manager, - ) - - # helper function to recursively apply config overrides to leaf subtasks, skipping their constituent groups. 
- # (setting of num_fewshot ; bypassing metric calculation ; setting fewshot seed) - def _adjust_config(task_dict): - adjusted_task_dict = {} - for task_name, task_obj in task_dict.items(): - if isinstance(task_obj, dict): - adjusted_task_dict = { - **adjusted_task_dict, - **{task_name: _adjust_config(task_obj)}, - } - - else: - if task_obj.get_config("output_type") == "generate_until": - if gen_kwargs is not None: - task_obj.set_config( - key="generation_kwargs", value=gen_kwargs, update=True - ) - eval_logger.info( - f"{task_obj.config.task}: Using gen_kwargs: {task_obj.config.generation_kwargs}" - ) - - if predict_only: - eval_logger.info( - f"Processing {task_name} in output-only mode. Metrics will not be calculated!" - ) - # we have to change the class properties post-hoc. This is pretty hacky. - task_obj.override_metric(metric_name="bypass") - - # override tasks' fewshot values to the provided num_fewshot arg value - # except if tasks have it set to 0 manually in their configs--then we should never overwrite that - if num_fewshot is not None: - if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0: - eval_logger.info( - f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored." - ) - else: - eval_logger.warning( - f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}" - ) - task_obj.set_config(key="num_fewshot", value=num_fewshot) - else: - # if num_fewshot not provided, and the task does not define a default one, default to 0 - if ( - default_num_fewshot := task_obj.get_config("num_fewshot") - ) is None: - task_obj.set_config(key="num_fewshot", value=0) - # fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. 
in the YAML file) - task_obj.set_fewshot_seed(seed=fewshot_random_seed) - - adjusted_task_dict[task_name] = task_obj - - return adjusted_task_dict - - task_dict = _adjust_config(task_dict) - - if check_integrity: - run_task_tests(task_list=tasks) - - if evaluation_tracker is not None: - evaluation_tracker.general_config_tracker.log_experiment_args( - model_source=model, - model_args=model_args, - system_instruction=system_instruction, - chat_template=lm.chat_template(apply_chat_template) - if apply_chat_template - else None, - fewshot_as_multiturn=fewshot_as_multiturn, - ) - - results = evaluate( - lm=lm, - task_dict=task_dict, - limit=limit, - samples=samples, - cache_requests=cache_requests, - rewrite_requests_cache=rewrite_requests_cache, - bootstrap_iters=bootstrap_iters, - write_out=write_out, - log_samples=True if predict_only else log_samples, - system_instruction=system_instruction, - apply_chat_template=apply_chat_template, - fewshot_as_multiturn=fewshot_as_multiturn, - verbosity=verbosity, - confirm_run_unsafe_code=confirm_run_unsafe_code, - ) - if verbosity is not None: - setup_logging(verbosity=verbosity) - - if lm.rank == 0: - if isinstance(model, str): - model_name = model - elif hasattr(model, "config") and hasattr(model.config, "_name_or_path"): - model_name = model.config._name_or_path - else: - model_name = type(model).__name__ - - # add info about the model and few shot config - results["config"] = { - "model": model_name, - "model_args": model_args, - } - # add more detailed model info if available - if isinstance(lm, lm_eval.models.huggingface.HFLM): - results["config"].update(lm.get_model_info()) - # add info about execution - results["config"].update( - { - "batch_size": batch_size, - "batch_sizes": ( - list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else [] - ), - "device": device, - "use_cache": use_cache, - "limit": limit, - "bootstrap_iters": bootstrap_iters, - "gen_kwargs": gen_kwargs, - "random_seed": random_seed, - 
"numpy_seed": numpy_random_seed, - "torch_seed": torch_random_seed, - "fewshot_seed": fewshot_random_seed, - } - ) - results["git_hash"] = get_git_commit_hash() - results["date"] = start_date - add_env_info(results) # additional environment info to results - add_tokenizer_info(results, lm) # additional info about tokenizer - return results - else: - return None - - -@positional_deprecated -def evaluate( - lm: "LM", - task_dict, - limit: Optional[int] = None, - samples: Optional[dict] = None, - cache_requests: bool = False, - rewrite_requests_cache: bool = False, - bootstrap_iters: Optional[int] = 100000, - write_out: bool = False, - log_samples: bool = True, - system_instruction: Optional[str] = None, - apply_chat_template: Union[bool, str] = False, - fewshot_as_multiturn: bool = False, - verbosity: str = "INFO", - confirm_run_unsafe_code: bool = False, -): - """Instantiate and evaluate a model on a list of tasks. - - :param lm: obj - Language Model - :param task_dict: dict[str, Task] - Dictionary of tasks. Tasks will be taken to have name type(task).config.task . - :param limit: int, optional - Limit the number of examples per task (only use this for testing) - :param samples: dictionary, optional - Dictionary indicating which examples should be tested in each task, e.g., {"mmlu_astronomy":[0,3,6],"mmlu_anatomy":[1,4,7,10]}. - :param cache_requests: bool, optional - Speed up evaluation by caching the building of dataset requests. - :param rewrite_requests_cache: bool, optional - Rewrites all the request cache if set to `True`. - :param bootstrap_iters: - Number of iterations for bootstrap statistics, used when calculating stderr. Set to 0 for skipping all stderr calculations. 
- :param write_out: bool - If True, write out an example document and model input for checking task integrity - :param log_samples: bool - If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis - :param system_instruction: str - System instruction to be applied to the prompt - :param apply_chat_template: Union[bool, str] - Specifies whether to apply a chat template to the prompt. - - If set to True, the default chat template is applied. - - If set to a string, applies the specified chat template by name. - Defaults to False (no chat template applied). - :param fewshot_as_multiturn: bool - Whether to provide the fewshot examples as a multiturn conversation or a single user turn. - :param verbosity: str - Verbosity level for logging - :param confirm_run_unsafe_code: bool - Whether to confirm running tasks marked as unsafe. - :return - Dictionary of results - """ - - if limit is not None and samples is not None: - raise ValueError( - "Either 'limit' or 'samples' must be None, but both are not None." - ) - if samples is not None: - eval_logger.info(f"Evaluating examples for tasks {list(samples.keys())}") - if apply_chat_template: - eval_logger.warning( - "Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details." - ) - # tracks all Instances/requests a model must generate output on. - requests = defaultdict(list) - # stores the amount to pad out reqs per req. 
type so that - # number of fwd passes per distributed rank is equal - padding_requests = defaultdict(int) - - # get lists of group hierarchy and each type of request - eval_tasks = get_task_list(task_dict) - if not log_samples: - if not all( - "bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys() - for task_output in eval_tasks - ): - raise ValueError("log_samples must be True for 'bypass' metric-only tasks") - - # validation checks: - # 1.are we running multimodal task <-> non-multimodal model class, or vice-versa. - # 2.are we running code that is marked as unsafe. - incompatible_tasks = [] - for task_output in eval_tasks: - task: Task = task_output.task - - if getattr(task, "MULTIMODAL", False) and not getattr(lm, "MULTIMODAL", False): - incompatible_tasks.append(task_output.task_name) - elif getattr(task, "UNSAFE_CODE", False) and not confirm_run_unsafe_code: - raise ValueError( - f"Attempted to run task: {task_output.task_name} which is marked as unsafe. Set confirm_run_unsafe_code=True to run this task." - ) - if len(incompatible_tasks) > 0: - if not getattr(lm, "MULTIMODAL", False): - raise ValueError( - f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type." - ) - # end validation check - - # Cache the limit arg. 
- limit_arg = limit - limits = [] - for task_output in eval_tasks: - task: Task = task_output.task - - limit = get_sample_size(task, limit_arg) - limits.append(limit) - task.build_all_requests( - limit=limit, - samples=samples.get(task_output.task_name, None) - if samples is not None - else samples, - rank=lm.rank, - world_size=lm.world_size, - cache_requests=cache_requests, - rewrite_requests_cache=rewrite_requests_cache, - system_instruction=system_instruction, - apply_chat_template=bool(apply_chat_template), - fewshot_as_multiturn=fewshot_as_multiturn, - chat_template=getattr(lm, "apply_chat_template") - if apply_chat_template - else None, - tokenizer_name=getattr(lm, "tokenizer_name", "") - if apply_chat_template - else "", - ) - eval_logger.debug( - f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}" - ) - if write_out: - print_writeout(task) - # aggregate Instances by LM method requested to get output. - for instance in task.instances: - reqtype = instance.request_type - requests[reqtype].append(instance) - - if lm.world_size > 1: - instances_rnk = torch.tensor(len(task._instances), device=lm.device) - gathered_item = ( - lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist() - ) - # "multiple_choice" task types dispatch (several) "loglikelihood" request types - reqtype = ( - "loglikelihood" - if task.OUTPUT_TYPE == "multiple_choice" - else task.OUTPUT_TYPE - ) - # compute number of pseudo-batches to pad with (FSDP/DDP require even batches among ranks) - numpad = max(gathered_item) - gathered_item[lm.rank] - # todo: may not account for padding in cases like SquadV2 which has multiple req types - padding_requests[reqtype] += numpad - - ### Run LM on inputs, get all outputs ### - # execute each type of request - for reqtype, reqs in requests.items(): - eval_logger.info(f"Running {reqtype} requests") - # create `K` copies of each request `req` based off `K = req.repeats` - cloned_reqs = [] - for req in reqs: - 
cloned_reqs.extend([req] * req.repeats) - - if (lm.world_size > 1) and (padding_requests[reqtype] > 0): - for _ in range(padding_requests[reqtype]): - cloned_reqs.extend([req] * req.repeats) - - # run requests through model - resps = getattr(lm, reqtype)(cloned_reqs) - - # put responses from model into a list of length K for each request. - for x, req in zip(resps, cloned_reqs): - req.resps.append(x) - - if lm.world_size > 1: - lm.accelerator.wait_for_everyone() - - RANK = lm.rank - WORLD_SIZE = lm.world_size - ### Postprocess outputs ### - # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately) - for task_output, limit in zip(eval_tasks, limits): - task = task_output.task - task.apply_filters() - - ### Collect values of metrics on all datapoints ### - # # unpack results and sort back in order and return control to Task - # TODO: make it possible to use a different metric per filter - # Pre-process task.instances to group by doc_id - instances_by_doc_id = defaultdict(list) - for instance in task.instances: - instances_by_doc_id[instance.doc_id].append(instance) - # Sort instances within each group - for instances in instances_by_doc_id.values(): - instances.sort(key=lambda x: x.idx) - # iterate over different filters used - for filter_key in task.instances[0].filtered_resps.keys(): - indices = ( - samples.get(task_output.task_name, None) - if samples is not None - else None - ) - doc_iterator = task.doc_iterator( - rank=RANK, - limit=limit, - world_size=WORLD_SIZE, - samples=indices, - ) - for doc_id, doc in doc_iterator: - if indices: - doc_id_true = indices[doc_id] - else: - doc_id_true = doc_id - requests = instances_by_doc_id[doc_id] - metrics = task.process_results( - doc, [req.filtered_resps[filter_key] for req in requests] - ) - if log_samples: - target = task.doc_to_target(doc) - example = { - "doc_id": doc_id_true, - "doc": doc, - "target": target, - "arguments": [req.args for req in requests], - "resps": 
[req.resps for req in requests], - "filtered_resps": [ - req.filtered_resps[filter_key] for req in requests - ], - "filter": filter_key, - "metrics": list(metrics.keys()), - "doc_hash": hash_string( - json.dumps( - requests[0].doc, - indent=2, - default=handle_non_serializable, - ensure_ascii=False, - ) - ), - "prompt_hash": hash_string(requests[0].arguments[0]), - "target_hash": hash_string(str(target)), - } - example.update(metrics) - task_output.logged_samples.append(example) - for metric, value in metrics.items(): - task_output.sample_metrics[(metric, filter_key)].append(value) - - if WORLD_SIZE > 1: - # if multigpu, then gather data across all ranks to rank 0 - # first gather logged samples across all ranks - for task_output in eval_tasks: - if log_samples: - # for task_name, task_samples in list(samples.items()): - full_samples = [None] * WORLD_SIZE if RANK == 0 else None - torch.distributed.gather_object( - obj=task_output.logged_samples, - object_gather_list=full_samples, - dst=0, - ) - - if RANK == 0: - task_output.logged_samples = list( - itertools.chain.from_iterable(full_samples) - ) - - # then collect metrics across all ranks - for metrics in task_output.sample_metrics: - metric_list = [None] * WORLD_SIZE if RANK == 0 else None - torch.distributed.gather_object( - obj=task_output.sample_metrics[metrics], - object_gather_list=metric_list, - dst=0, - ) - if RANK == 0: - task_output.sample_metrics[metrics] = list( - itertools.chain.from_iterable(metric_list) - ) - - if RANK == 0: - ### Aggregate results over all datapoints ### - # aggregate results ; run bootstrap CIs - for task_output in eval_tasks: - task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters) - ( - results, - samples, - configs, - versions, - num_fewshot, - higher_is_better, - ) = consolidate_results(eval_tasks) - - ### Calculate group metrics ### - if bool(results): - results, versions, show_group_table, *_ = consolidate_group_results( - results, versions, task_dict - ) - - 
results_agg, group_agg = prepare_print_tasks(task_dict, results) - subtask_list = get_subtask_list(task_dict) - - # collect all higher_is_better values for metrics - # in the group's subtasks. - # TODO: clean this up ; unify with the below metric_list loop? - _higher_is_better = {} - for group, task_list in subtask_list.items(): - if ( - len(task_list) != 0 - ): # subtask list will list "task_name": [] for solo tasks - for task in task_list: - for m, h in higher_is_better[task].items(): - if m not in _higher_is_better.keys(): - _higher_is_better[m] = h - - if ( - m in _higher_is_better - and _higher_is_better[m] is not None - and _higher_is_better[m] != h - ): - eval_logger.warning( - f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None." - ) - _higher_is_better[m] = None - higher_is_better[group] = _higher_is_better - - results_dict = { - "results": dict(results_agg.items()), - **( - {"groups": dict(group_agg.items())} - if (bool(group_agg) & show_group_table) - else {} - ), - "group_subtasks": dict(reversed(subtask_list.items())), - "configs": dict(sorted(configs.items())), - "versions": dict(sorted(versions.items())), - "n-shot": dict(sorted(num_fewshot.items())), - "higher_is_better": dict(sorted(higher_is_better.items())), - "n-samples": { - task_output.task_name: { - "original": len(task_output.task.eval_docs), - "effective": min( - limit if limit else len(task_output.task.eval_docs), - len(task_output.task.eval_docs), - ), - } - for task_output, limit in zip(eval_tasks, limits) - }, - } - if log_samples: - results_dict["samples"] = dict(samples) - - return results_dict - - else: - return None - - -def request_caching_arg_to_dict(cache_requests: str) -> dict: - request_caching_args = { - "cache_requests": cache_requests in {"true", "refresh"}, - "rewrite_requests_cache": cache_requests == "refresh", - "delete_requests_cache": cache_requests == "delete", - } - - return request_caching_args diff --git 
a/lm-evaluation-harness/lm_eval/evaluator_utils.py b/lm-evaluation-harness/lm_eval/evaluator_utils.py deleted file mode 100644 index 0bd87b6c7a3923d7e2cdf71894ca87b10137ac9c..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/evaluator_utils.py +++ /dev/null @@ -1,554 +0,0 @@ -import collections -import logging -import math -import pathlib -import sys -from typing import List, Optional, Tuple, Union - -from lm_eval.api.group import ConfigurableGroup -from lm_eval.api.metrics import ( - aggregate_subtask_metrics, - mean, - pooled_sample_stderr, - stderr_for_metric, -) -from lm_eval.api.task import Task -from lm_eval.utils import positional_deprecated - - -eval_logger = logging.getLogger(__name__) - - -class TaskOutput: - """ - Wrapper class for Task outputs.It contains various attributes and methods to manage and calculate metrics for the task. - - Attributes: - task (object): The task object. - task_name (str): The name of the task. - task_config (dict): The configuration of the task. - version (str): The version of the task. - group_name (str): The name of the task group. - n_shot (int): The number of shots for the task. - task_alias (str): The alias of the task. - group_alias (str): The alias of the task group. - is_group (bool): Indicates if the task is a group. - logged_samples (list): The list of logged samples. - sample_len (int): The length of the samples. - sample_metrics (defaultdict): The dictionary of samples' metrics. - agg_metrics (defaultdict): The dictionary of aggregate metrics. - - Methods: - from_taskdict(cls, task_name: str, task): - Creates a TaskOutput instance from a task dictionary. - - calculate_aggregate_metric(bootstrap_iters=100000) -> None: - Calculates the aggregate metrics for the task. 
- """ - - def __init__( - self, - task=None, - task_name=None, - task_config=None, - version=None, - group_name=None, - n_shot=None, - task_alias=None, - group_alias=None, - is_group=None, - ): - self.task = task - self.task_config = task_config - self.task_name = task_name - self.group_name = group_name - self.version = version - self.n_shot = n_shot - self.task_alias = task_alias - self.group_alias = group_alias - self.is_group = is_group - self.logged_samples = [] - self.sample_len = None - self.sample_metrics = collections.defaultdict(list) - self.agg_metrics = collections.defaultdict(list) - - @classmethod - def from_taskdict(cls, task_name: str, task): - if isinstance(task, tuple): - group_name, task = task - else: - group_name = None - if not task: - # these gets filtered out in get_task_list - # once they are added to group hierarchy - is_group = True - return cls( - task=task, task_name=task_name, is_group=is_group, group_name=group_name - ) - version = task.VERSION - task_config = dict(task.dump_config()) - if (n_shot := task_config.get("num_fewshot")) == 0: - n_shot = task_config.get("metadata", {}).get("num_fewshot", 0) - task_alias = task_config.get("alias") - group_alias = task_config.get("group_alias") - return cls( - task=task, - task_name=task_name, - task_config=task_config, - group_name=group_name, - version=version, - n_shot=n_shot, - task_alias=task_alias, - group_alias=group_alias, - ) - - def calculate_aggregate_metric(self, bootstrap_iters=100000) -> None: - for (metric, filter_key), items in self.sample_metrics.items(): - try: - agg_fn = self.task.aggregation()[metric] - except KeyError: - # This is when process results output an arbitrary metric - # TODO: Handle this better and allow other aggregate functions other than mean. - agg_fn = mean - metric_key = f"{metric},{filter_key}" - self.agg_metrics[metric_key] = agg_fn(items) - self.sample_len = len(items) # TODO: same sample size for each metric? 
- if isinstance(bootstrap_iters, int): - stderr_fn = stderr_for_metric( - metric=agg_fn, - bootstrap_iters=min(bootstrap_iters, 100) - if metric in ["bleu", "chrf", "ter"] - else bootstrap_iters, - ) - self.agg_metrics[f"{metric}_stderr,{filter_key}"] = ( - stderr_fn(items) if (stderr_fn and len(items) > 1) else "N/A" - ) - else: - raise ValueError( - f"Received bootstrap_iters '{bootstrap_iters}' but expected an integer. Set to 0 to turn off stderr calculations." - ) - - def __repr__(self): - return ( - f"TaskOutput(task_name={self.task_name}, " - f"group_name={self.group_name}, " - f"version={self.version}, " - f"n_shot={self.n_shot}, " - f"task_alias={self.task_alias}, " - f"group_alias={self.group_alias})" - ) - - -def get_task_list(task_dict: dict) -> List[TaskOutput]: - outputs = [] - for task_name, task_obj in task_dict.items(): - if isinstance(task_obj, dict): - _outputs = get_task_list(task_obj) - outputs.extend(_outputs) - else: - task_output = TaskOutput.from_taskdict(task_name, task_obj) - outputs.append(task_output) - - return outputs - - -def get_subtask_list(task_dict, task_root=None, depth=0): - subtask_list = {} - for group_obj, task_obj in task_dict.items(): - if isinstance(group_obj, ConfigurableGroup): - # group_name = group_obj.group_name - group_name = group_obj.group_name - else: - group_name = group_obj - if isinstance(task_obj, dict): - _subtask_list = get_subtask_list( - task_obj, task_root=group_name, depth=depth + 1 - ) - if task_root: - subtask_list.setdefault((task_root, depth), []).extend( - [ - _task - for (_task, _depth) in _subtask_list.keys() - if (_depth - 1) == depth - ] - ) - - subtask_list = {**subtask_list, **_subtask_list} - else: - if isinstance(task_obj, ConfigurableGroup): - # group_or_task_name = task_obj.group_name - group_or_task_name = task_obj.group_name - elif isinstance(task_obj, Task): - # group_or_task_name = task_obj.task_name - group_or_task_name = task_obj.task_name - - if task_root is None: - 
subtask_list.setdefault((group_or_task_name, depth), []) - else: - subtask_list.setdefault((task_root, depth), []).append( - group_or_task_name - ) - - if depth == 0: - _subtask_list = {} - for group_key, task_list in subtask_list.items(): - group_name, depth = group_key - _subtask_list[group_name] = task_list - subtask_list = _subtask_list - - return subtask_list - - -def print_writeout(task) -> None: - for inst in task.instances: - # print the prompt for the first few documents - if inst.doc_id < 1: - eval_logger.info( - f"Task: {task}; document {inst.doc_id}; context prompt (starting on next line):\ - \n{inst.args[0]}\n(end of prompt on previous line)\ntarget string or answer choice index (starting on next line):\n{task.doc_to_target(inst.doc)}\n(end of target on previous line)" - ) - eval_logger.info(f"Request: {str(inst)}") - - -def get_sample_size(task, limit: Optional[int]) -> Union[int, None]: - if limit is not None: - limit = ( - int(math.ceil(len(task.eval_docs) * limit)) if limit < 1.0 else int(limit) - ) - return limit - - -def prepare_print_tasks( - task_dict: dict, - results: dict, - task_depth=0, - group_depth=0, -) -> Tuple[dict, dict]: - """ - @param task_dict: Dictionary representing the group hierarchy of tasks. Each key is a group name and its - value is a list of task names. - @param results: Dictionary containing the results of each task. Each key is a - group name and its value is a dictionary of task results. - @param task_depth: The indentation level for printing the task - hierarchy. Default is 0. - @param group_depth: The indentation level for printing the group - hierarchy. Default is 0. - @return: A tuple of two dictionaries: results_agg and groups_agg. results_agg contains - aggregated results for each task, and groups_agg contains aggregated results for each group. - - Prepares the task hierarchy and aggregates the results for each task and group recursively for printing. 
- """ - - def _sort_task_dict(task_dict): - """ - Helper utility. Sorts the task dict at the current level of the hierarchy based on alphabetized task name. - Required so that we end up sorting within each sub-header correctly. - """ - - return dict( - sorted( - task_dict.items(), - key=lambda item: item[0].group_name - if isinstance(item[0], ConfigurableGroup) - else item[0], - ) - ) - - task_agg = collections.defaultdict(dict) - group_agg = collections.defaultdict(dict) - task_dict = _sort_task_dict(task_dict) - for task_or_group_name, task_or_group_obj in task_dict.items(): - tab_string = " " * task_depth + "- " if task_depth > 0 else "" - if isinstance(task_or_group_name, ConfigurableGroup): - # string_name = task_or_group_name.group_name - name = task_or_group_name.group_name - from_configurable_group = True - task_or_group_obj = _sort_task_dict(task_or_group_obj) - elif isinstance(task_or_group_name, str): - name = task_or_group_name - if isinstance(task_or_group_obj, Task): - # string_name = task_or_group_obj.task_name - name = task_or_group_obj.task_name - from_configurable_group = False - - task_agg[name] = results[name].copy() - if from_configurable_group: - if task_or_group_name.group_alias is not None: - alias = task_or_group_name.group_alias - else: - alias = task_or_group_name.group - else: - if "alias" in task_agg[name]: - alias = task_agg[name]["alias"] - else: - alias = name - - task_agg[name]["alias"] = tab_string + alias - if "samples" in task_agg[name]: - task_agg[name].pop("samples") - - if from_configurable_group and (" " not in results[name]): - group_tab_string = " " * group_depth + "- " if group_depth > 0 else "" - group_agg[name] = results[name].copy() - group_agg[name]["alias"] = group_tab_string + alias - if "samples" in group_agg[name]: - group_agg[name].pop("samples") - - if isinstance(task_or_group_obj, dict): - task_depth += 1 - group_depth += 1 - _task_agg, _group_agg = prepare_print_tasks( - task_or_group_obj, results, task_depth, 
group_depth - ) - task_agg = { - **task_agg, - **_task_agg, - } - group_agg = {**group_agg, **_group_agg} - task_depth -= 1 - group_depth -= 1 - return task_agg, group_agg - - -def consolidate_results( - eval_tasks: List[TaskOutput], -) -> Tuple[dict, dict, dict, dict, dict, dict]: - """ - @param eval_tasks: list(TaskOutput). - @return: A tuple containing the consolidated results, samples, configs, versions, and num_fewshot. - - Consolidates the results of multiple evaluation tasks into a single structure. - - The method iterates over each evaluation instance and extracts relevant information to create the consolidated - results structure. The consolidated results structure has the following properties: - - - results: A defaultdict with task names as keys and dictionaries as values. Each dictionary contains - metric/filter pairs as keys and corresponding metric values as values. The "alias" key is used to store task - aliases specified in the task configuration. - - samples: A defaultdict with task names as keys and lists of log samples as values. - - configs: A defaultdict with task names as keys and task configurations as values. - - versions: A defaultdict with task names as keys and task versions as values. - - num_fewshot: A defaultdict with task names as keys and number of few-shot samples as values. - - higher_is_better: A defaultdict with task names as keys and indicators of whether higher values are better - for each metric as values. - - The method then returns the consolidated results, samples, configs, versions, and num_fewshot as a tuple. - """ - # stores the final result for each task, for each metric/filter pair. - results = collections.defaultdict(dict) - # logs info about each document evaluated. - samples = collections.defaultdict(list) - # store num-fewshot value per task - num_fewshot = collections.defaultdict(int) - # Tracks the YAML configs of all chosen task - configs = collections.defaultdict(dict) - # Tracks each task's version. 
- versions = collections.defaultdict(dict) - # Track `higher_is_better` for each metric - higher_is_better = collections.defaultdict(dict) - - for task_output in eval_tasks: - if "task_alias" in (task_config := task_output.task_config): - results[task_output.task_name]["alias"] = task_config["task_alias"] - else: - results[task_output.task_name]["alias"] = task_output.task_name - if group_alias := task_output.group_alias: - if group_alias not in results and (group_name := task_output.group_name): - results[group_name]["alias"] = group_alias - num_fewshot[task_output.task_name] = task_output.n_shot - configs[task_output.task_name] = task_output.task_config - versions[task_output.task_name] = task_output.version - samples[task_output.task_name] = task_output.logged_samples - higher_is_better[task_output.task_name] = task_output.task.higher_is_better() - for (metric, filter_key), items in task_output.sample_metrics.items(): - metric_key = f"{metric},{filter_key}" - results[task_output.task_name][metric_key] = task_output.agg_metrics[ - metric_key - ] - results[task_output.task_name]["samples"] = task_output.sample_len - results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = ( - task_output.agg_metrics[f"{metric}_stderr,{filter_key}"] - ) - return results, samples, configs, versions, num_fewshot, higher_is_better - - -def consolidate_group_results( - results, - versions, - task_dict, - task_root=None, - show_group_table=False, - task_aggregation_list=None, -) -> Tuple[dict, dict, bool, Union[None,]]: - """ - (Recursively) calculates groups' aggregated metrics and updates the results and versions dictionaries with this info. - - @return: a tuple [results, versions, show_group_table, task_aggregation_list] with formats described below: - - - results: A defaultdict with task names (and, after this function is called, group names of - groups that perform aggregation) as keys, and dictionaries with "alias" and metric,filter_name pairs as keys. 
- - versions: A defaultdict with task names (and, after this function is called, group names of - groups that perform aggregation) as keys, and float values representing the task or group's version if a version is specified. (defaulting to None). - - show_group_table: a boolean which is true if there exists a group that requires printing of its aggregated scores in a group table. - - task_aggregation_list: a defaultdict listing the subtasks to average over to produce a given group's end metric. - - The method then returns the updated results, versions, show_group_table, and task_aggregation_list as a tuple. - In the top-level invocation of this function, task_aggregation_list is ignored. - """ - if task_root is None: - task_root = {} - - if task_aggregation_list is None: - task_aggregation_list = {} - - for group_or_task, group_or_task_info in task_dict.items(): - # Convert to string - if isinstance(group_or_task, ConfigurableGroup): - group_config = group_or_task.config - group_or_task = group_or_task.group_name - else: - group_config = None - - if isinstance(group_or_task_info, Task): - if task_root: - task_aggregation_list.setdefault(task_root, []).append( - group_or_task_info.task_name - ) - else: - ( - results, - versions, - show_group_table, - _task_aggregation_list, - ) = consolidate_group_results( - results, - versions, - group_or_task_info, - group_or_task, - show_group_table, - task_aggregation_list, - ) - if task_root: - task_aggregation_list.setdefault(task_root, []).extend( - task_aggregation_list.get(group_or_task, []) - ) - - if (group_config is None) or ( - group_config["aggregate_metric_list"] is None - ): - results[group_or_task][" "] = " " - continue - - if "aggregate_metric_list" in group_config: - agg_metric_list = group_config["aggregate_metric_list"] - - show_group_table = show_group_table | bool( - group_config["aggregate_metric_list"] - ) - - task_list = _task_aggregation_list[group_or_task] - - metric_list = list( - { - key - for task in 
task_list - for key in results[task].keys() - if "_stderr" not in key and key not in ["task", "alias", "samples"] - } - ) - for metric in metric_list: - stderr = "_stderr,".join(metric.split(",")) - - # gather metrics, sizes, and stderrs from subtasks - metrics = [ - results[task][metric] - for task in task_list - if metric in results[task] - ] # TODO: copy? - stderrs = [ - results[task][stderr] - for task in task_list - if stderr in results[task] - ] - sizes = [ - results[task]["samples"] - for task in task_list - if metric in results[task] - ] - - for metric_config in agg_metric_list: - for filter_name in metric_config["filter_list"]: - if metric != ",".join([metric_config["metric"], filter_name]): - continue - - # compute group's pooled metric and stderr - if metric_config["aggregation"] == "mean": - aggregate_fn = aggregate_subtask_metrics - elif callable(metric_config["aggregation"]): - aggregate_fn = metric_config["aggregation"] - else: - raise ValueError( - f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{metric_config['aggregation']}' for group '{group_or_task}'" - ) - - results[group_or_task][metric] = aggregate_fn( - metrics, - sizes, - metric_config["weight_by_size"], - ) - # TODO: calculate groups' metrics using arbitrary agg fns - if "N/A" in stderrs: - results[group_or_task][stderr] = "N/A" - else: - # NOTE: this assumes we are using the mean to aggregate. 
There are warnings about this elsewhere - results[group_or_task][stderr] = pooled_sample_stderr( - stderrs, sizes - ) - - results[group_or_task]["samples"] = sum(sizes) - group_metadata = group_config.get("metadata", None) - if group_metadata is not None: - versions[group_or_task] = group_metadata.get("version", None) - # print(results) - return results, versions, show_group_table, task_aggregation_list - - -@positional_deprecated -def find_test_root(start_path: pathlib.Path) -> pathlib.Path: - """ - Search upward in the directory tree to a maximum of three layers - to find and return the package root (containing the 'tests' folder) - """ - cur_path = start_path.resolve() - max_layers = 3 - for _ in range(max_layers): - if (cur_path / "tests" / "test_version_stable.py").exists(): - return cur_path - else: - cur_path = cur_path.parent.resolve() - raise FileNotFoundError( - f"Unable to find package root within {max_layers} upwards" + f"of {start_path}" - ) - - -@positional_deprecated -def run_task_tests(task_list: List[str]): - """ - Find the package root and run the tests for the given tasks - """ - import pytest - - package_root = find_test_root(start_path=pathlib.Path(__file__)) - task_string = " or ".join(task_list) - args = [ - f"{package_root}/tests/test_version_stable.py", - f"--rootdir={package_root}", - "-k", - f"{task_string}", - ] - sys.path.append(str(package_root)) - pytest_return_val = pytest.main(args) - if pytest_return_val: - raise ValueError( - f"Not all tests for the specified tasks ({task_list}) ran successfully! 
Error code: {pytest_return_val}" - ) diff --git a/lm-evaluation-harness/lm_eval/filters/__init__.py b/lm-evaluation-harness/lm_eval/filters/__init__.py deleted file mode 100644 index be5c9d43624ea901cc578c65689be5bd263209a5..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/filters/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -from functools import partial -from typing import List - -from lm_eval.api.filter import FilterEnsemble -from lm_eval.api.registry import get_filter - -from . import custom, extraction, selection, transformation - - -def build_filter_ensemble( - filter_name: str, components: List[List[str]] -) -> FilterEnsemble: - """ - Create a filtering pipeline. - """ - filters = [] - for function, kwargs in components: - if kwargs is None: - kwargs = {} - # create a filter given its name in the registry - f = partial(get_filter(function), **kwargs) - # add the filter as a pipeline step - filters.append(f) - - return FilterEnsemble(name=filter_name, filters=filters) diff --git a/lm-evaluation-harness/lm_eval/filters/custom.py b/lm-evaluation-harness/lm_eval/filters/custom.py deleted file mode 100644 index ab22c51eda74670aaea6699fc68992994c41932d..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/filters/custom.py +++ /dev/null @@ -1,17 +0,0 @@ -from lm_eval.api.filter import Filter -from lm_eval.api.registry import register_filter - - -@register_filter("custom") -class CustomFilter(Filter): - """ - Custom filter that applies a custom, user-defined function to the model responses. 
- """ - - def __init__(self, **kwargs) -> None: - self.filter_fn = kwargs.pop("filter_fn") - - super().__init__(**kwargs) - - def apply(self, resps, docs): - return self.filter_fn(resps, docs) diff --git a/lm-evaluation-harness/lm_eval/filters/decontamination.py b/lm-evaluation-harness/lm_eval/filters/decontamination.py deleted file mode 100644 index 4eda4e022445355f191926790b2edf8f0cfa4bbd..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/filters/decontamination.py +++ /dev/null @@ -1,25 +0,0 @@ -from lm_eval.api.filter import Filter -from lm_eval.api.registry import register_filter - - -@register_filter("decontaminate") -class DecontaminationFilter(Filter): - """ - A filter which evaluates - """ - - name = "track_decontamination" - - def __init__(self, path) -> None: - """ - - TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path"). - should further cache result on a given (task_name, doc_id) - """ - self._decontam_results = None - - def apply(self, resps, docs) -> None: - """ - Return {"no_contamination", "only_contamination"} keys for the 2 different subsets - """ - pass diff --git a/lm-evaluation-harness/lm_eval/filters/extraction.py b/lm-evaluation-harness/lm_eval/filters/extraction.py deleted file mode 100644 index 22ca883a9d00b2156c6aedc5df7448879a03da65..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/filters/extraction.py +++ /dev/null @@ -1,233 +0,0 @@ -import re -import sys -import unicodedata - -from lm_eval.api.filter import Filter -from lm_eval.api.registry import register_filter - - -@register_filter("regex") -class RegexFilter(Filter): - """A filter that extracts values from text using regex pattern matching. - - This filter applies a regex pattern to each model response and extracts matched values. - If no match is found, returns a fallback value. 
Useful for extracting structured data - (like numbers) from unstructured model outputs. - """ - - def __init__( - self, - regex_pattern: str = r"#### (\-?[0-9\.\,]+)", - group_select: int = 0, - fallback: str = "[invalid]", - ) -> None: - """ - pass a string `regex` to run `re.compile(r"regex")` on. - `fallback` defines the output returned if no matches for the regex are located. - """ - self.regex_pattern = regex_pattern - self.regex = re.compile(regex_pattern) - self.group_select = group_select - self.fallback = fallback - - def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]: - # here, we assume we have a list, in which each element is - # a list of model responses for some particular input/target pair. - # so we process each of these (same input/target response sets) - # independently (and keep them a list.) - def filter_set(inst): - filtered = [] - for resp in inst: - match = self.regex.findall(resp) - if match: - match = match[self.group_select] - if isinstance(match, tuple): - match = [m for m in match if m] - if match: - match = match[0] - else: - match = self.fallback - match = match.strip() - else: - match = self.fallback - filtered.append(match) - return filtered - - filtered_resps = list(map(lambda x: filter_set(x), resps)) - return filtered_resps - - -@register_filter("regex_pos") -class POSFilter(Filter): - """ """ - - def __init__( - self, - regex_pattern: str = r"\['(.*?)'\]", - group_select=0, - fallback=None, - ) -> None: - """ - pass a string `regex` to run `re.compile(r"regex")` on. - `fallback` defines the output returned if no matches for the regex are located. 
- """ - if fallback is None: - fallback = ["invalid"] - self.regex_pattern = regex_pattern - self.regex = re.compile(regex_pattern) - self.group_select = group_select - self.fallback = fallback - - def apply(self, resps, docs): - def extract_tagged_tokens(text): - # Extract tagged tokens list from text input using regex - tokens = re.findall(r"\('([^']*)', '([^']*)'\)", text) - return [(token, pos) for token, pos in tokens] - - def extract_pos_tags(result): - pos_tags = [] - if isinstance(result, str): - result = extract_tagged_tokens(result) - pos_tags.extend(pos for _, pos in result) - return pos_tags if pos_tags else self.fallback - - def filter_set(inst): - filtered = [] - for resp in inst: - match = extract_pos_tags(resp) - filtered.append(match) - return filtered - - filtered_resps = map(lambda x: filter_set(x), resps) - - return filtered_resps - - -@register_filter("remove_whitespace") -class WhitespaceFilter(Filter): - """Filters out leading whitespace from responses.""" - - def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]: - def filter_set(inst): - filtered_resp = [] - for resp in inst: - resp = resp.lstrip() - filtered_resp.append(resp) - return filtered_resp - - filtered_resps = [filter_set(resp) for resp in resps] - - return filtered_resps - - -@register_filter("multi_choice_regex") -class MultiChoiceRegexFilter(RegexFilter): - """ - A filter used to extract a model's answer on multiple choice questions with - letter answers. assumes each document has a "choices" field - containing the list of answer choices and that the answer label symbols - are of the form (A), (B), (C), ... or A, B, C. - """ - - def __init__( - self, - regex_pattern: str = r"#### (\-?[0-9\.\,]+)", - group_select=0, - fallback: str = "[invalid]", - ignore_case=False, - ignore_punctuation=False, - regexes_to_ignore=None, - ) -> None: - """ - regex_pattern: The basic regex pattern to use. 
If fails to match, we will use the customized match procedure - - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response. - - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices. - group_select: Selects the (group_select)th match from the findall result. - ignore_case: Ignores the case during step 1 matching - ignore_punctuation: Remove the punctuation during step 1 matching - regexes_to_ignore: Remove these regexes during step 1 matching - """ - super().__init__(regex_pattern, group_select, fallback) - self.ignore_case = ignore_case - self.ignore_punctuation = ignore_punctuation - self.regexes_to_ignore = regexes_to_ignore - - def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]: - # here, we assume we have a list, in which each element is - # a list of model responses for some particular input/target pair. - # so we process each of these (same input/target response sets) - # independently (and keep them a list.) 
- - def find_match(regex, resp, convert_dict={}): - match = regex.findall(resp) - if match: - match = match[self.group_select] - if isinstance(match, tuple): - match = [m for m in match if m][0] - match = match.strip() - if match and match in convert_dict: - match = convert_dict[match] - return match - - punct_tbl = dict.fromkeys( - i - for i in range(sys.maxunicode) - if unicodedata.category(chr(i)).startswith("P") - ) - - def filter_ignores(st): - if self.regexes_to_ignore is not None: - for s in self.regexes_to_ignore: - st = re.sub(s, "", st) - - if self.ignore_case: - st = st.lower() - - if self.ignore_punctuation: - # https://stackoverflow.com/a/266162 - st = st.translate(punct_tbl) - return st - - filtered_resps = [] - - for r, doc in zip(resps, docs): - fallback_regexes = [] - choice_to_alpha = {} - next_alpha = "A" - - without_paren_fallback_regexes = [] - without_paren_to_target = {} - - choices = doc["choices"] - for c in choices: - m = filter_ignores(c.strip()) - fallback_regexes.append(f"{re.escape(m)}") - choice_to_alpha[m] = f"({next_alpha})" - - without_paren_fallback_regexes.append(next_alpha) - without_paren_to_target[next_alpha] = f"({next_alpha})" - - next_alpha = chr(ord(next_alpha) + 1) - fallback_regex = re.compile("|".join(fallback_regexes)) - without_paren_fallback_regex = "|".join(without_paren_fallback_regexes) - without_paren_fallback_regex = re.compile( - rf":[\s]*({without_paren_fallback_regex})" - ) - - filtered = [] - for resp in r: - match = find_match(self.regex, resp) - if not match: - match = find_match( - fallback_regex, filter_ignores(resp), choice_to_alpha - ) - if not match: - match = find_match( - without_paren_fallback_regex, resp, without_paren_to_target - ) - if not match: - match = self.fallback - filtered.append(match) - filtered_resps.append(filtered) - - return filtered_resps diff --git a/lm-evaluation-harness/lm_eval/filters/selection.py b/lm-evaluation-harness/lm_eval/filters/selection.py deleted file mode 100644 
index 8c670ed74d00655441cc45181fba1265f0db5290..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/filters/selection.py +++ /dev/null @@ -1,61 +0,0 @@ -from collections import Counter - -from lm_eval.api.filter import Filter -from lm_eval.api.registry import register_filter - - -# TODO: implement "arg_max" filter. either it should take in an arbitrary "scoring"/reward function -# that takes an input and returns a scalar and then should select the max reward, -# or should implement different filters for different ways of handling a reward model's inference. - - -@register_filter("take_first") -class TakeFirstFilter(Filter): - def __init__(self) -> None: - """ - Can define custom behavior here, if an individual instantiation of a Filter class should have state. - """ - - def apply(self, resps, docs): - """ - Assuming each entry of `resps` is a list of model responses, we discard all but the first response. - """ - return map(lambda r: r[0], resps) - - -@register_filter("take_first_k") -class TakeKFilter(Filter): - def __init__(self, **kwargs) -> None: - self.k = kwargs.pop("k") - - super().__init__(**kwargs) - - def apply(self, resps, docs): - # need resp to be subscriptable to check below - resps = list(resps) - # check we have at least k responses per doc, else we can't take the first k - assert len(resps[0]) >= self.k, ( - f"Need at least {self.k} responses per doc to take first {self.k}, but got {len(resps[0])} only! Please increase TaskConfig.repeats ." - ) - return map(lambda r: r[: self.k], resps) - - -@register_filter("majority_vote") -class MajorityVoteFilter(Filter): - def __init__(self) -> None: - """ - Can define custom behavior here, if an individual instantiation of a Filter class should have state. - """ - - def apply(self, resps, docs): - """ - Each entry of `resps` is a list of model responses. - We select the response that occurs most frequently in each entry of `resps`. 
- """ - - def select_majority(resp): - counts = Counter(resp) - vote = counts.most_common(1)[0][0] - return vote - - return map(lambda r: [select_majority(r)], resps) diff --git a/lm-evaluation-harness/lm_eval/filters/transformation.py b/lm-evaluation-harness/lm_eval/filters/transformation.py deleted file mode 100644 index 722c67403c8adbc499283a611df17eb1743307b8..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/filters/transformation.py +++ /dev/null @@ -1,122 +0,0 @@ -import re - -from lm_eval.api.filter import Filter -from lm_eval.api.registry import register_filter - - -@register_filter("lowercase") -class LowercaseFilter(Filter): - def __init__(self) -> None: - pass - - def apply(self, resps, docs): - def filter_set(inst): - return [resp.lower() for resp in inst] - - return [filter_set(resp) for resp in resps] - - -@register_filter("uppercase") -class UppercaseFilter(Filter): - def __init__(self) -> None: - pass - - def apply(self, resps, docs): - def filter_set(inst): - return [resp.upper() for resp in inst] - - return [filter_set(resp) for resp in resps] - - -@register_filter("map") -class MapFilter(Filter): - def __init__(self, mapping_dict: dict = None, default_value=None) -> None: - """ - Initializes the MapFilter with a given mapping dictionary and default value. - - Args: - - mapping_dict (dict): A dictionary containing the key-value mappings. - Default is an empty dictionary. - - default_value (Any): The value to be returned when a key is not found in the mapping_dict. - Default is None. 
- - Example: - mapper = MapFilter({'A': 1, 'B': 2}, default_value=0) - """ - if mapping_dict is None: - mapping_dict = {} - assert isinstance(mapping_dict, dict), ( - "Provided mapping_dict is not a dictionary" - ) - self.mapping_dict = mapping_dict - self.default_value = default_value - - def apply(self, resps, docs): - def filter_set(inst): - return [self.mapping_dict.get(resp, self.default_value) for resp in inst] - - return [filter_set(resp) for resp in resps] - - -@register_filter("format_span") -class SPANFilter(Filter): - def __init__(self) -> None: - pass - - def apply(self, resps, docs): - def format_ner_text(text): - label_dict = { - "person": "PER", - "location": "LOC", - "organization": "ORG", - "counties": "LOC", - "places": "LOC", - "people": "PER", - "persons": "PER", - "company": "ORG", - "country": "LOC", - "continent": "LOC", - "time": "DATE", - "date": "DATE", - "per": "PER", - "loc": "LOC", - "org": "ORG", - } - text = text.lower() - for key, value in label_dict.items(): - text = text.replace(key, value) - - text = "$".join(i for i in text.split("$$")) - return text.rstrip("$$") - - def format_named_entities(text): - """ - Extract named entities from text and format them as 'label: value $$ label: value'. - Handles grouped entities (e.g., LOC: kenya, uganda) and excludes 'none' values. 
- """ - # Regular expression to match label: entities pattern - pattern = r"\b(PER|LOC|ORG|DATE):\s*([^$]+)" - # Normalize newline characters - text = text.replace("\n", "$").strip() - matches = re.findall(pattern, text) - - formatted_entities = [] - - for label, values in matches: - # Split multiple entities separated by commas and strip whitespace - entities = [value.strip() for value in values.split(",")] - - # Exclude 'none' entities - for entity in entities: - if entity.lower() != "none": - formatted_entities.append(f"{label.lower()}: {entity}") - - # Join entities with the desired separator - return " $ ".join(formatted_entities) - - def filter_set(inst): - return [ - format_named_entities(format_ner_text(resp.lower())) for resp in inst - ] - - return [filter_set(resp) for resp in resps] diff --git a/lm-evaluation-harness/lm_eval/loggers/__init__.py b/lm-evaluation-harness/lm_eval/loggers/__init__.py deleted file mode 100644 index 02b7a6834c6486fde35ef02d715e90be3fba223a..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/loggers/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .evaluation_tracker import EvaluationTracker -from .wandb_logger import WandbLogger diff --git a/lm-evaluation-harness/lm_eval/loggers/evaluation_tracker.py b/lm-evaluation-harness/lm_eval/loggers/evaluation_tracker.py deleted file mode 100644 index 634a62577439e89d7df23f93511f44e327e7f38e..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/loggers/evaluation_tracker.py +++ /dev/null @@ -1,537 +0,0 @@ -import json -import logging -import os -import re -import time -from collections import defaultdict -from dataclasses import asdict, dataclass -from datetime import datetime -from pathlib import Path - -from datasets import load_dataset -from datasets.utils.metadata import MetadataConfigs -from huggingface_hub import ( - DatasetCard, - DatasetCardData, - HfApi, - hf_hub_url, -) -from huggingface_hub.utils import build_hf_headers, 
get_session, hf_raise_for_status - -from lm_eval.utils import ( - get_file_datetime, - get_file_task_name, - get_results_filenames, - get_sample_results_filenames, - handle_non_serializable, - hash_string, - sanitize_list, - sanitize_model_name, - sanitize_task_name, -) - - -eval_logger = logging.getLogger(__name__) - - -@dataclass(init=False) -class GeneralConfigTracker: - """ - Tracker for the evaluation parameters. - - Attributes: - model_source (str): Source of the model (e.g. Hugging Face, GGUF, etc.) - model_name (str): Name of the model. - model_name_sanitized (str): Sanitized model name for directory creation. - start_time (float): Start time of the experiment. Logged at class init. - end_time (float): Start time of the experiment. Logged when calling [`GeneralConfigTracker.log_end_time`] - total_evaluation_time_seconds (str): Inferred total evaluation time in seconds (from the start and end times). - """ - - model_source: str = None - model_name: str = None - model_name_sanitized: str = None - system_instruction: str = None - system_instruction_sha: str = None - fewshot_as_multiturn: bool = None - chat_template: str = None - chat_template_sha: str = None - start_time: float = None - end_time: float = None - total_evaluation_time_seconds: str = None - - def __init__(self) -> None: - """Starts the evaluation timer.""" - self.start_time = time.perf_counter() - - @staticmethod - def _get_model_name(model_args: str) -> str: - """Extracts the model name from the model arguments.""" - - def extract_model_name(model_args: str, key: str) -> str: - """Extracts the model name from the model arguments using a key.""" - args_after_key = model_args.split(key)[1] - return args_after_key.split(",")[0] - - # order does matter, e.g. 
peft and delta are provided together with pretrained - prefixes = ["peft=", "delta=", "pretrained=", "model=", "path=", "engine="] - for prefix in prefixes: - if prefix in model_args: - return extract_model_name(model_args, prefix) - return "" - - def log_experiment_args( - self, - model_source: str, - model_args: str, - system_instruction: str, - chat_template: str, - fewshot_as_multiturn: bool, - ) -> None: - """Logs model parameters and job ID.""" - self.model_source = model_source - self.model_name = GeneralConfigTracker._get_model_name(model_args) - self.model_name_sanitized = sanitize_model_name(self.model_name) - self.system_instruction = system_instruction - self.system_instruction_sha = ( - hash_string(system_instruction) if system_instruction else None - ) - self.chat_template = chat_template - self.chat_template_sha = hash_string(chat_template) if chat_template else None - self.fewshot_as_multiturn = fewshot_as_multiturn - - def log_end_time(self) -> None: - """Logs the end time of the evaluation and calculates the total evaluation time.""" - self.end_time = time.perf_counter() - self.total_evaluation_time_seconds = str(self.end_time - self.start_time) - - -class EvaluationTracker: - """ - Keeps track and saves relevant information of the evaluation process. - Compiles the data from trackers and writes it to files, which can be published to the Hugging Face hub if requested. - """ - - def __init__( - self, - output_path: str = None, - hub_results_org: str = "", - hub_repo_name: str = "", - details_repo_name: str = "", - results_repo_name: str = "", - push_results_to_hub: bool = False, - push_samples_to_hub: bool = False, - public_repo: bool = False, - token: str = "", - leaderboard_url: str = "", - point_of_contact: str = "", - gated: bool = False, - ) -> None: - """ - Creates all the necessary loggers for evaluation tracking. - - Args: - output_path (str): Path to save the results. If not provided, the results won't be saved. 
- hub_results_org (str): The Hugging Face organization to push the results to. If not provided, the results will be pushed to the owner of the Hugging Face token. - hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`. - details_repo_name (str): The name of the Hugging Face repository to push the details to. If not provided, the results will be pushed to `lm-eval-results`. - result_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will not be pushed and will be found in the details_hub_repo. - push_results_to_hub (bool): Whether to push the results to the Hugging Face hub. - push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub. - public_repo (bool): Whether to push the results to a public or private repository. - token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`. - leaderboard_url (str): URL to the leaderboard on the Hugging Face hub on the dataset card. - point_of_contact (str): Contact information on the Hugging Face hub dataset card. - gated (bool): Whether to gate the repository. - """ - self.general_config_tracker = GeneralConfigTracker() - - self.output_path = output_path - self.push_results_to_hub = push_results_to_hub - self.push_samples_to_hub = push_samples_to_hub - self.public_repo = public_repo - self.leaderboard_url = leaderboard_url - self.point_of_contact = point_of_contact - self.api = HfApi(token=token) if token else None - self.gated_repo = gated - - if not self.api and (push_results_to_hub or push_samples_to_hub): - raise ValueError( - "Hugging Face token is not defined, but 'push_results_to_hub' or 'push_samples_to_hub' is set to True. " - "Please provide a valid Hugging Face token by setting the HF_TOKEN environment variable." 
- ) - - if ( - self.api - and hub_results_org == "" - and (push_results_to_hub or push_samples_to_hub) - ): - hub_results_org = self.api.whoami()["name"] - eval_logger.warning( - f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'." - ) - - if hub_repo_name == "": - details_repo_name = ( - details_repo_name if details_repo_name != "" else "lm-eval-results" - ) - results_repo_name = ( - results_repo_name if results_repo_name != "" else details_repo_name - ) - else: - details_repo_name = hub_repo_name - results_repo_name = hub_repo_name - eval_logger.warning( - "hub_repo_name was specified. Both details and results will be pushed to the same repository. Using hub_repo_name is no longer recommended, details_repo_name and results_repo_name should be used instead." - ) - - self.details_repo = f"{hub_results_org}/{details_repo_name}" - self.details_repo_private = f"{hub_results_org}/{details_repo_name}-private" - self.results_repo = f"{hub_results_org}/{results_repo_name}" - self.results_repo_private = f"{hub_results_org}/{results_repo_name}-private" - - def save_results_aggregated( - self, - results: dict, - samples: dict, - ) -> None: - """ - Saves the aggregated results and samples to the output path and pushes them to the Hugging Face hub if requested. - - Args: - results (dict): The aggregated results to save. - samples (dict): The samples results to save. 
- """ - self.general_config_tracker.log_end_time() - - if self.output_path: - try: - eval_logger.info("Saving results aggregated") - - # calculate cumulative hash for each task - only if samples are provided - task_hashes = {} - if samples: - for task_name, task_samples in samples.items(): - sample_hashes = [ - s["doc_hash"] + s["prompt_hash"] + s["target_hash"] - for s in task_samples - ] - task_hashes[task_name] = hash_string("".join(sample_hashes)) - - # update initial results dict - results.update({"task_hashes": task_hashes}) - results.update(asdict(self.general_config_tracker)) - dumped = json.dumps( - results, - indent=2, - default=handle_non_serializable, - ensure_ascii=False, - ) - - path = Path(self.output_path if self.output_path else Path.cwd()) - self.date_id = datetime.now().isoformat().replace(":", "-") - if path.suffix == ".json": - path.parent.mkdir(parents=True, exist_ok=True) - file_results_aggregated = path.with_name( - f"{path.stem}_{self.date_id}.json" - ) - else: - path = path.joinpath( - self.general_config_tracker.model_name_sanitized - ) - path.mkdir(parents=True, exist_ok=True) - file_results_aggregated = path.joinpath( - f"results_{self.date_id}.json" - ) - - file_results_aggregated.open("w", encoding="utf-8").write(dumped) - - if self.api and self.push_results_to_hub: - repo_id = ( - self.results_repo - if self.public_repo - else self.results_repo_private - ) - self.api.create_repo( - repo_id=repo_id, - repo_type="dataset", - private=not self.public_repo, - exist_ok=True, - ) - self.api.upload_file( - repo_id=repo_id, - path_or_fileobj=str(file_results_aggregated), - path_in_repo=os.path.join( - self.general_config_tracker.model_name, - file_results_aggregated.name, - ), - repo_type="dataset", - commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}", - ) - eval_logger.info( - "Successfully pushed aggregated results to the Hugging Face Hub. 
" - f"You can find them at: {repo_id}" - ) - - except Exception as e: - eval_logger.warning("Could not save results aggregated") - eval_logger.info(repr(e)) - else: - eval_logger.info( - "Output path not provided, skipping saving results aggregated" - ) - - def save_results_samples( - self, - task_name: str, - samples: dict, - ) -> None: - """ - Saves the samples results to the output path and pushes them to the Hugging Face hub if requested. - - Args: - task_name (str): The task name to save the samples for. - samples (dict): The samples results to save. - """ - if self.output_path: - try: - eval_logger.info(f"Saving per-sample results for: {task_name}") - - path = Path(self.output_path if self.output_path else Path.cwd()) - if path.suffix == ".json": - path = path.parent - else: - path = path.joinpath( - self.general_config_tracker.model_name_sanitized - ) - path.mkdir(parents=True, exist_ok=True) - - file_results_samples = path.joinpath( - f"samples_{task_name}_{self.date_id}.jsonl" - ) - - for sample in samples: - # we first need to sanitize arguments and resps - # otherwise we won't be able to load the dataset - # using the datasets library - arguments = {} - for i, arg in enumerate(sample["arguments"]): - arguments[f"gen_args_{i}"] = {} - for j, tmp in enumerate(arg): - arguments[f"gen_args_{i}"][f"arg_{j}"] = tmp - - sample["resps"] = sanitize_list(sample["resps"]) - sample["filtered_resps"] = sanitize_list(sample["filtered_resps"]) - sample["arguments"] = arguments - sample["target"] = str(sample["target"]) - - sample_dump = ( - json.dumps( - sample, - default=handle_non_serializable, - ensure_ascii=False, - ) - + "\n" - ) - - with open(file_results_samples, "a", encoding="utf-8") as f: - f.write(sample_dump) - - if self.api and self.push_samples_to_hub: - repo_id = ( - self.details_repo - if self.public_repo - else self.details_repo_private - ) - self.api.create_repo( - repo_id=repo_id, - repo_type="dataset", - private=not self.public_repo, - 
exist_ok=True, - ) - try: - if self.gated_repo: - headers = build_hf_headers() - r = get_session().put( - url=f"https://huggingface.co/api/datasets/{repo_id}/settings", - headers=headers, - json={"gated": "auto"}, - ) - hf_raise_for_status(r) - except Exception as e: - eval_logger.warning("Could not gate the repository") - eval_logger.info(repr(e)) - self.api.upload_folder( - repo_id=repo_id, - folder_path=str(path), - path_in_repo=self.general_config_tracker.model_name_sanitized, - repo_type="dataset", - commit_message=f"Adding samples results for {task_name} to {self.general_config_tracker.model_name}", - ) - eval_logger.info( - f"Successfully pushed sample results for task: {task_name} to the Hugging Face Hub. " - f"You can find them at: {repo_id}" - ) - - except Exception as e: - eval_logger.warning("Could not save sample results") - eval_logger.info(repr(e)) - else: - eval_logger.info("Output path not provided, skipping saving sample results") - - def recreate_metadata_card(self) -> None: - """ - Creates a metadata card for the evaluation results dataset and pushes it to the Hugging Face hub. - """ - - eval_logger.info("Recreating metadata card") - repo_id = self.details_repo if self.public_repo else self.details_repo_private - - files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset") - results_files = get_results_filenames(files_in_repo) - sample_files = get_sample_results_filenames(files_in_repo) - - # Build a dictionary to store the latest evaluation datetime for: - # - Each tested model and its aggregated results - # - Each task and sample results, if existing - # i.e. 
{ - # "org__model_name__gsm8k": "2021-09-01T12:00:00", - # "org__model_name__ifeval": "2021-09-01T12:00:00", - # "org__model_name__results": "2021-09-01T12:00:00" - # } - latest_task_results_datetime = defaultdict(lambda: datetime.min.isoformat()) - - for file_path in sample_files: - file_path = Path(file_path) - filename = file_path.name - model_name = file_path.parent - task_name = get_file_task_name(filename) - results_datetime = get_file_datetime(filename) - task_name_sanitized = sanitize_task_name(task_name) - # Results and sample results for the same model and task will have the same datetime - samples_key = f"{model_name}__{task_name_sanitized}" - results_key = f"{model_name}__results" - latest_datetime = max( - latest_task_results_datetime[samples_key], - results_datetime, - ) - latest_task_results_datetime[samples_key] = latest_datetime - latest_task_results_datetime[results_key] = max( - latest_task_results_datetime[results_key], - latest_datetime, - ) - - # Create metadata card - card_metadata = MetadataConfigs() - - # Add the latest aggregated results to the metadata card for easy access - for file_path in results_files: - file_path = Path(file_path) - results_filename = file_path.name - model_name = file_path.parent - eval_date = get_file_datetime(results_filename) - eval_date_sanitized = re.sub(r"[^\w\.]", "_", eval_date) - results_filename = Path("**") / Path(results_filename).name - config_name = f"{model_name}__results" - sanitized_last_eval_date_results = re.sub( - r"[^\w\.]", "_", latest_task_results_datetime[config_name] - ) - - if eval_date_sanitized == sanitized_last_eval_date_results: - # Ensure that all results files are listed in the metadata card - current_results = card_metadata.get(config_name, {"data_files": []}) - current_results["data_files"].append( - {"split": eval_date_sanitized, "path": [str(results_filename)]} - ) - card_metadata[config_name] = current_results - # If the results file is the newest, update the "latest" field in 
the metadata card - card_metadata[config_name]["data_files"].append( - {"split": "latest", "path": [str(results_filename)]} - ) - - # Add the tasks details configs - for file_path in sample_files: - file_path = Path(file_path) - filename = file_path.name - model_name = file_path.parent - task_name = get_file_task_name(filename) - eval_date = get_file_datetime(filename) - task_name_sanitized = sanitize_task_name(task_name) - eval_date_sanitized = re.sub(r"[^\w\.]", "_", eval_date) - results_filename = Path("**") / Path(filename).name - config_name = f"{model_name}__{task_name_sanitized}" - sanitized_last_eval_date_results = re.sub( - r"[^\w\.]", "_", latest_task_results_datetime[config_name] - ) - if eval_date_sanitized == sanitized_last_eval_date_results: - # Ensure that all sample results files are listed in the metadata card - current_details_for_task = card_metadata.get( - config_name, {"data_files": []} - ) - current_details_for_task["data_files"].append( - {"split": eval_date_sanitized, "path": [str(results_filename)]} - ) - card_metadata[config_name] = current_details_for_task - # If the samples results file is the newest, update the "latest" field in the metadata card - card_metadata[config_name]["data_files"].append( - {"split": "latest", "path": [str(results_filename)]} - ) - - # Get latest results and extract info to update metadata card examples - latest_datetime = max(latest_task_results_datetime.values()) - latest_model_name = max( - latest_task_results_datetime, key=lambda k: latest_task_results_datetime[k] - ) - last_results_file = [ - f for f in results_files if latest_datetime.replace(":", "-") in f - ][0] - last_results_file_path = hf_hub_url( - repo_id=repo_id, filename=last_results_file, repo_type="dataset" - ) - latest_results_file = load_dataset( - "json", data_files=last_results_file_path, split="train" - ) - results_dict = latest_results_file["results"][0] - new_dictionary = {"all": results_dict} - new_dictionary.update(results_dict) - 
results_string = json.dumps(new_dictionary, indent=4) - - dataset_summary = ( - "Dataset automatically created during the evaluation run of model " - ) - if self.general_config_tracker.model_source == "hf": - dataset_summary += f"[{self.general_config_tracker.model_name}](https://huggingface.co/{self.general_config_tracker.model_name})\n" - else: - dataset_summary += f"{self.general_config_tracker.model_name}\n" - dataset_summary += ( - f"The dataset is composed of {len(card_metadata) - 1} configuration(s), each one corresponding to one of the evaluated task.\n\n" - f"The dataset has been created from {len(results_files)} run(s). Each run can be found as a specific split in each " - 'configuration, the split being named using the timestamp of the run.The "train" split is always pointing to the latest results.\n\n' - 'An additional configuration "results" store all the aggregated results of the run.\n\n' - "To load the details from a run, you can for instance do the following:\n" - ) - if self.general_config_tracker.model_source == "hf": - dataset_summary += ( - "```python\nfrom datasets import load_dataset\n" - f'data = load_dataset(\n\t"{repo_id}",\n\tname="{latest_model_name}",\n\tsplit="latest"\n)\n```\n\n' - ) - dataset_summary += ( - "## Latest results\n\n" - f"These are the [latest results from run {latest_datetime}]({last_results_file_path.replace('/resolve/', '/blob/')}) " - "(note that there might be results for other tasks in the repos if successive evals didn't cover the same tasks. 
" - 'You find each in the results and the "latest" split for each eval):\n\n' - f"```python\n{results_string}\n```" - ) - card_data = DatasetCardData( - dataset_summary=dataset_summary, - repo_url=f"https://huggingface.co/{self.general_config_tracker.model_name}", - pretty_name=f"Evaluation run of {self.general_config_tracker.model_name}", - leaderboard_url=self.leaderboard_url, - point_of_contact=self.point_of_contact, - ) - card_metadata.to_dataset_card_data(card_data) - card = DatasetCard.from_template( - card_data, - pretty_name=card_data.pretty_name, - ) - card.push_to_hub(repo_id, repo_type="dataset") diff --git a/lm-evaluation-harness/lm_eval/loggers/utils.py b/lm-evaluation-harness/lm_eval/loggers/utils.py deleted file mode 100644 index c2640953603635d2d96dd85bde9ef14b0175cac2..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/loggers/utils.py +++ /dev/null @@ -1,149 +0,0 @@ -import logging -import os -import re -import subprocess -from importlib.metadata import version -from pathlib import Path -from typing import Any, Dict, Optional, Tuple, Union - -import numpy as np -from torch.utils.collect_env import get_pretty_env_info -from transformers import __version__ as trans_version - - -logger = logging.getLogger(__name__) - - -def remove_none_pattern(input_string: str) -> Tuple[str, bool]: - """Remove the ',none' substring from the input_string if it exists at the end. - - Args: - input_string (str): The input string from which to remove the ',none' substring. - - Returns: - Tuple[str, bool]: A tuple containing the modified input_string with the ',none' substring removed - and a boolean indicating whether the modification was made (True) or not (False). 
- """ - # Define the pattern to match ',none' at the end of the string - pattern = re.compile(r",none$") - - # Use sub() to replace ',none' with an empty string - result = re.sub(pattern, "", input_string) - - # check if the input_string changed - removed = result != input_string - - return result, removed - - -def _handle_non_serializable(o: Any) -> Union[int, str, list]: - """Handle non-serializable objects by converting them to serializable types. - - Args: - o (Any): The object to be handled. - - Returns: - Union[int, str, list]: The converted object. If the object is of type np.int64 or np.int32, - it will be converted to int. If the object is of type set, it will be converted - to a list. Otherwise, it will be converted to str. - """ - if isinstance(o, np.int64) or isinstance(o, np.int32): - return int(o) - elif isinstance(o, set): - return list(o) - else: - return str(o) - - -def get_commit_from_path(repo_path: Union[Path, str]) -> Optional[str]: - try: - git_folder = Path(repo_path, ".git") - if git_folder.is_file(): - git_folder = Path( - git_folder.parent, - git_folder.read_text(encoding="utf-8").split("\n")[0].split(" ")[-1], - ) - if Path(git_folder, "HEAD").exists(): - head_name = ( - Path(git_folder, "HEAD") - .read_text(encoding="utf-8") - .split("\n")[0] - .split(" ")[-1] - ) - head_ref = Path(git_folder, head_name) - git_hash = head_ref.read_text(encoding="utf-8").replace("\n", "") - else: - git_hash = None - except Exception as err: - logger.debug( - f"Failed to retrieve a Git commit hash from path: {str(repo_path)}. Error: {err}" - ) - return None - return git_hash - - -def get_git_commit_hash(): - """ - Gets the git commit hash of your current repo (if it exists). 
- Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42 - """ - try: - git_hash = subprocess.check_output(["git", "describe", "--always"]).strip() - git_hash = git_hash.decode() - except (subprocess.CalledProcessError, FileNotFoundError): - # FileNotFoundError occurs when git not installed on system - git_hash = get_commit_from_path(os.getcwd()) # git hash of repo if exists - return git_hash - - -def add_env_info(storage: Dict[str, Any]): - try: - pretty_env_info = get_pretty_env_info() - except Exception as err: - pretty_env_info = str(err) - try: - lm_eval_version = version("lm_eval") - except Exception as err: - lm_eval_version = str(err) - transformers_version = trans_version - upper_dir_commit = get_commit_from_path( - Path(os.getcwd(), "..") - ) # git hash of upper repo if exists - added_info = { - "pretty_env_info": pretty_env_info, - "transformers_version": transformers_version, - "lm_eval_version": lm_eval_version, - "upper_git_hash": upper_dir_commit, # in case this repo is submodule - } - storage.update(added_info) - - -def add_tokenizer_info(storage: Dict[str, Any], lm): - if getattr(lm, "tokenizer", False): - try: - tokenizer_info = { - "tokenizer_pad_token": [ - lm.tokenizer.pad_token, - str(lm.tokenizer.pad_token_id), - ], - "tokenizer_eos_token": [ - lm.tokenizer.eos_token, - str(lm.tokenizer.eos_token_id), - ], - "tokenizer_bos_token": [ - lm.tokenizer.bos_token, - str(lm.tokenizer.bos_token_id), - ], - "eot_token_id": getattr(lm, "eot_token_id", None), - "max_length": getattr(lm, "max_length", None), - } - storage.update(tokenizer_info) - except Exception as err: - logger.debug( - f"Logging detailed tokenizer info failed with {err}, skipping..." - ) - # seems gguf and textsynth do not have tokenizer - else: - logger.debug( - "LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results." 
- ) diff --git a/lm-evaluation-harness/lm_eval/loggers/wandb_logger.py b/lm-evaluation-harness/lm_eval/loggers/wandb_logger.py deleted file mode 100644 index 8795f5d8343143ac3badcde98e536c003a1a3fb8..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/loggers/wandb_logger.py +++ /dev/null @@ -1,358 +0,0 @@ -import copy -import json -import logging -from typing import Any, Dict, List, Literal, Tuple - -import numpy as np -import pandas as pd -from packaging.version import Version - -from lm_eval.loggers.utils import _handle_non_serializable, remove_none_pattern - - -logger = logging.getLogger(__name__) - - -def get_wandb_printer() -> Literal["Printer"]: - """Returns a wandb printer instance for pretty stdout.""" - from wandb.sdk.lib.printer import new_printer - - printer = new_printer() - return printer - - -class WandbLogger: - def __init__(self, init_args=None, config_args=None) -> None: - """Attaches to wandb logger if already initialized. Otherwise, passes init_args to wandb.init() and config_args to wandb.config.update() - - Args: - init_args Optional[Dict]: Arguments for init configuration. 
- config_args Optional[Dict]: Arguments for config - - Parse and log the results returned from evaluator.simple_evaluate() with: - wandb_logger.post_init(results) - wandb_logger.log_eval_result() - wandb_logger.log_eval_samples(results["samples"]) - """ - try: - import wandb - - assert Version(wandb.__version__) >= Version("0.13.6") - if Version(wandb.__version__) < Version("0.13.6"): - wandb.require("report-editing:v0") - except Exception as e: - logger.warning( - "To use the wandb reporting functionality please install wandb>=0.13.6.\n" - "To install the latest version of wandb run `pip install wandb --upgrade`\n" - f"{e}" - ) - - self.wandb_args: Dict[str, Any] = init_args or {} - self.wandb_config_args: Dict[str, Any] = config_args or {} - - # pop the step key from the args to save for all logging calls - self.step = self.wandb_args.pop("step", None) - - # initialize a W&B run - if wandb.run is None: - self.run = wandb.init(**self.wandb_args) - if self.wandb_config_args: - self.run.config.update(self.wandb_config_args) - else: - self.run = wandb.run - - self.printer = get_wandb_printer() - - def post_init(self, results: Dict[str, Any]) -> None: - self.results: Dict[str, Any] = copy.deepcopy(results) - self.task_names: List[str] = list(results.get("results", {}).keys()) - self.group_names: List[str] = list(results.get("groups", {}).keys()) - - def _get_config(self) -> Dict[str, Any]: - """Get configuration parameters.""" - self.task_configs = self.results.get("configs", {}) - cli_configs = self.results.get("config", {}) - configs = { - "task_configs": self.task_configs, - "cli_configs": cli_configs, - } - - return configs - - def _sanitize_results_dict(self) -> Tuple[Dict[str, str], Dict[str, Any]]: - """Sanitize the results dictionary.""" - _results = copy.deepcopy(self.results.get("results", dict())) - - # Remove None from the metric string name - tmp_results = copy.deepcopy(_results) - for task_name in self.task_names: - task_result = 
tmp_results.get(task_name, dict()) - for metric_name, metric_value in task_result.items(): - _metric_name, removed = remove_none_pattern(metric_name) - if removed: - _results[task_name][_metric_name] = metric_value - _results[task_name].pop(metric_name) - - # remove string valued keys from the results dict - wandb_summary = {} - for task in self.task_names: - task_result = _results.get(task, dict()) - for metric_name, metric_value in task_result.items(): - if isinstance(metric_value, str): - wandb_summary[f"{task}/{metric_name}"] = metric_value - - for summary_metric, summary_value in wandb_summary.items(): - _task, _summary_metric = summary_metric.split("/") - _results[_task].pop(_summary_metric) - - tmp_results = copy.deepcopy(_results) - for task_name, task_results in tmp_results.items(): - for metric_name, metric_value in task_results.items(): - _results[f"{task_name}/{metric_name}"] = metric_value - _results[task_name].pop(metric_name) - for task in self.task_names: - _results.pop(task) - - return wandb_summary, _results - - def _log_results_as_table(self) -> None: - """Generate and log evaluation results as a table to W&B.""" - columns = [ - "Version", - "Filter", - "num_fewshot", - "Metric", - "Value", - "Stderr", - ] - - def make_table(columns: List[str], key: str = "results"): - import wandb - - table = wandb.Table(columns=columns) - results = copy.deepcopy(self.results) - - for k, dic in results.get(key).items(): - if k in self.group_names and not key == "groups": - continue - version = results.get("versions").get(k) - if version == "N/A": - version = None - n = results.get("n-shot").get(k) - - for (mf), v in dic.items(): - m, _, f = mf.partition(",") - if m.endswith("_stderr"): - continue - if m == "alias": - continue - - if m + "_stderr" + "," + f in dic: - se = dic[m + "_stderr" + "," + f] - if se != "N/A": - se = "%.4f" % se - table.add_data(*[k, version, f, n, m, str(v), str(se)]) - else: - table.add_data(*[k, version, f, n, m, str(v), ""]) - - 
return table - - # log the complete eval result to W&B Table - table = make_table(["Tasks"] + columns, "results") - self.run.log({"evaluation/eval_results": table}, step=self.step) - - if "groups" in self.results.keys(): - table = make_table(["Groups"] + columns, "groups") - self.run.log({"evaluation/group_eval_results": table}, step=self.step) - - def _log_results_as_artifact(self) -> None: - """Log results as JSON artifact to W&B.""" - import wandb - - dumped = json.dumps( - self.results, indent=2, default=_handle_non_serializable, ensure_ascii=False - ) - artifact = wandb.Artifact("results", type="eval_results") - with artifact.new_file("results.json", mode="w", encoding="utf-8") as f: - f.write(dumped) - self.run.log_artifact(artifact) - - def log_eval_result(self) -> None: - """Log evaluation results to W&B.""" - # Log configs to wandb - configs = self._get_config() - self.run.config.update(configs, allow_val_change=self.step is not None) - - wandb_summary, self.wandb_results = self._sanitize_results_dict() - # update wandb.run.summary with items that were removed - self.run.summary.update(wandb_summary) - # Log the evaluation metrics to wandb - self.run.log(self.wandb_results, step=self.step) - # Log the evaluation metrics as W&B Table - self._log_results_as_table() - # Log the results dict as json to W&B Artifacts - self._log_results_as_artifact() - - def _generate_dataset( - self, data: List[Dict[str, Any]], config: Dict[str, Any] - ) -> pd.DataFrame: - """Generate a dataset from evaluation data. - - Args: - data (List[Dict[str, Any]]): The data to generate a dataset for. - config (Dict[str, Any]): The configuration of the task. - - Returns: - pd.DataFrame: A dataframe that is ready to be uploaded to W&B. 
- """ - ids = [x["doc_id"] for x in data] - labels = [x["target"] for x in data] - instance = [""] * len(ids) - resps = [""] * len(ids) - filtered_resps = [""] * len(ids) - model_outputs = {} - - metrics_list = config["metric_list"] - metrics = {} - for metric in metrics_list: - metric = metric.get("metric") - if metric in ["word_perplexity", "byte_perplexity", "bits_per_byte"]: - metrics[f"{metric}_loglikelihood"] = [x[metric][0] for x in data] - if metric in ["byte_perplexity", "bits_per_byte"]: - metrics[f"{metric}_bytes"] = [x[metric][1] for x in data] - else: - metrics[f"{metric}_words"] = [x[metric][1] for x in data] - else: - metrics[metric] = [x[metric] for x in data] - - if config["output_type"] == "loglikelihood": - instance = [x["arguments"][0][0] for x in data] - labels = [x["arguments"][0][1] for x in data] - resps = [ - f"log probability of continuation is {x['resps'][0][0][0]} " - + "\n\n" - + "continuation will {} generated with greedy sampling".format( - "not be" if not x["resps"][0][0][1] else "be" - ) - for x in data - ] - filtered_resps = [ - f"log probability of continuation is {x['filtered_resps'][0][0]} " - + "\n\n" - + "continuation will {} generated with greedy sampling".format( - "not be" if not x["filtered_resps"][0][1] else "be" - ) - for x in data - ] - elif config["output_type"] == "multiple_choice": - instance = [x["arguments"][0][0] for x in data] - choices = [ - "\n".join([f"{idx}. 
{y[1]}" for idx, y in enumerate(x["arguments"])]) - for x in data - ] - resps = [np.argmax([n[0][0] for n in x["resps"]]) for x in data] - filtered_resps = [ - np.argmax([n[0] for n in x["filtered_resps"]]) for x in data - ] - elif config["output_type"] == "loglikelihood_rolling": - instance = [x["arguments"][0][0] for x in data] - resps = [x["resps"][0][0] for x in data] - filtered_resps = [x["filtered_resps"][0] for x in data] - elif config["output_type"] == "generate_until": - instance = [x["arguments"][0][0] for x in data] - resps = [x["resps"][0][0] for x in data] - filtered_resps = [x["filtered_resps"][0] for x in data] - - model_outputs["raw_predictions"] = resps - model_outputs["filtered_predictions"] = filtered_resps - - df_data = { - "id": ids, - "data": instance, - } - if config["output_type"] == "multiple_choice": - df_data["choices"] = choices - - tmp_data = { - "input_len": [len(x) for x in instance], - "labels": labels, - "output_type": config["output_type"], - } - df_data.update(tmp_data) - df_data.update(model_outputs) - df_data.update(metrics) - - return pd.DataFrame(df_data) - - def _log_samples_as_artifact( - self, data: List[Dict[str, Any]], task_name: str - ) -> None: - import wandb - - # log the samples as an artifact - dumped = json.dumps( - data, - indent=2, - default=_handle_non_serializable, - ensure_ascii=False, - ) - artifact = wandb.Artifact(f"{task_name}", type="samples_by_task") - with artifact.new_file( - f"{task_name}_eval_samples.json", mode="w", encoding="utf-8" - ) as f: - f.write(dumped) - self.run.log_artifact(artifact) - # artifact.wait() - - def log_eval_samples(self, samples: Dict[str, List[Dict[str, Any]]]) -> None: - """Log evaluation samples to W&B. - - Args: - samples (Dict[str, List[Dict[str, Any]]]): Evaluation samples for each task. 
- """ - task_names: List[str] = [ - x for x in self.task_names if x not in self.group_names - ] - - ungrouped_tasks = [] - tasks_by_groups = {} - - for task_name in task_names: - group_names = self.task_configs[task_name].get("group", None) - if group_names: - if isinstance(group_names, str): - group_names = [group_names] - - for group_name in group_names: - if not tasks_by_groups.get(group_name): - tasks_by_groups[group_name] = [task_name] - else: - tasks_by_groups[group_name].append(task_name) - else: - ungrouped_tasks.append(task_name) - - for task_name in ungrouped_tasks: - eval_preds = samples[task_name] - - # log the samples as a W&B Table - df = self._generate_dataset(eval_preds, self.task_configs.get(task_name)) - self.run.log({f"{task_name}_eval_results": df}, step=self.step) - - # log the samples as a json file as W&B Artifact - self._log_samples_as_artifact(eval_preds, task_name) - - for group, grouped_tasks in tasks_by_groups.items(): - grouped_df = pd.DataFrame() - for task_name in grouped_tasks: - eval_preds = samples[task_name] - df = self._generate_dataset( - eval_preds, self.task_configs.get(task_name) - ) - df["group"] = group - df["task"] = task_name - grouped_df = pd.concat([grouped_df, df], ignore_index=True) - - # log the samples as a json file as W&B Artifact - self._log_samples_as_artifact(eval_preds, task_name) - - self.run.log({f"{group}_eval_results": grouped_df}, step=self.step) diff --git a/lm-evaluation-harness/lm_eval/models/__init__.py b/lm-evaluation-harness/lm_eval/models/__init__.py deleted file mode 100644 index 8582f0198821166f67155d29d6a358a4869cdb5a..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/models/__init__.py +++ /dev/null @@ -1,36 +0,0 @@ -from . 
import ( - anthropic_llms, - api_models, - dummy, - gguf, - hf_audiolm, - hf_steered, - hf_vlms, - huggingface, - ibm_watsonx_ai, - mamba_lm, - nemo_lm, - neuralmagic, - neuron_optimum, - openai_completions, - optimum_ipex, - optimum_lm, - sglang_causallms, - sglang_generate_API, - textsynth, - vllm_causallms, - vllm_vlms, -) - - -# TODO: implement __all__ - - -try: - # enable hf hub transfer if available - import hf_transfer # type: ignore # noqa - import huggingface_hub.constants # type: ignore - - huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True -except ImportError: - pass diff --git a/lm-evaluation-harness/lm_eval/models/anthropic_llms.py b/lm-evaluation-harness/lm_eval/models/anthropic_llms.py deleted file mode 100644 index ed4599758f1c5635534655890917f24774bfaf5b..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/models/anthropic_llms.py +++ /dev/null @@ -1,367 +0,0 @@ -import logging -import os -from functools import cached_property -from typing import Any, Dict, List, Tuple, Union - -from tqdm import tqdm - -from lm_eval.api.model import LM -from lm_eval.api.registry import register_model -from lm_eval.models.openai_completions import LocalCompletionsAPI -from lm_eval.models.utils import handle_stop_sequences, retry_on_specific_exceptions - - -eval_logger = logging.getLogger(__name__) - - -def anthropic_completion( - client, #: anthropic.Anthropic, - model: str, - prompt: str, - max_tokens_to_sample: int, - temperature: float, - stop: List[str], - **kwargs: Any, -) -> str: - """Wrapper function around the Anthropic completion API client with exponential back-off - in case of RateLimitError. - - params: - client: anthropic.Anthropic - Anthropic API client - model: str - Anthropic model e.g. 
'claude-instant-v1', 'claude-2' - prompt: str - Prompt to feed to the model - max_tokens_to_sample: int - Maximum number of tokens to sample from the model - temperature: float - Sampling temperature - stop: List[str] - List of stop sequences - kwargs: Any - Additional model_args to pass to the API client - """ - - try: - import anthropic - except ModuleNotFoundError as exception: - raise type(exception)( - "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ -please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", - ) - - def _exception_callback(e: Exception, sleep_time: float) -> None: - eval_logger.warning( - f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds" - ) - - @retry_on_specific_exceptions( - on_exceptions=[anthropic.RateLimitError], - max_retries=None, # retry forever, consider changing - on_exception_callback=_exception_callback, - ) - def completion(): - response = client.completions.create( - prompt=f"{anthropic.HUMAN_PROMPT} {prompt}{anthropic.AI_PROMPT}", - model=model, - # NOTE: Claude really likes to do CoT, and overly aggressive stop sequences - # (e.g. gsm8k's ":") may truncate a lot of the input. - stop_sequences=[anthropic.HUMAN_PROMPT] + stop, - max_tokens_to_sample=max_tokens_to_sample, - temperature=temperature, - **kwargs, - ) - return response.completion - - return completion() - - -def anthropic_chat( - client, #: anthropic.Anthropic, - model: str, - prompt: str, - max_tokens: int, - temperature: float, - stop: List[str], - **kwargs: Any, -) -> str: - """Wrapper function around the Anthropic completion API client with exponential back-off - in case of RateLimitError. - - params: - client: anthropic.Anthropic - Anthropic API client - model: str - Anthropic model e.g. 
'claude-3-opus-20240229', 'claude-3-sonnet-20240229' - prompt: str - Prompt to feed to the model - max_tokens: int - Maximum number of tokens to sample from the model - temperature: float - Sampling temperature - stop: List[str] - List of stop sequences - kwargs: Any - Additional model_args to pass to the API client - """ - - try: - import anthropic - except ModuleNotFoundError as exception: - raise type(exception)( - "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ -please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", - ) - - def _exception_callback(e: Exception, sleep_time: float) -> None: - eval_logger.warning( - f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds" - ) - - @retry_on_specific_exceptions( - on_exceptions=[ - anthropic.RateLimitError, - anthropic.APIConnectionError, - anthropic.APIStatusError, - ], - max_retries=None, # retry forever, consider changing - on_exception_callback=_exception_callback, - ) - def messages(): - response = client.messages.create( - model=model, - max_tokens=max_tokens, - temperature=temperature, - messages=[{"role": "user", "content": f"{prompt}"}], - **kwargs, - ) - return response.content[0].text - - return messages() - - -@register_model("anthropic-completions") -class AnthropicLM(LM): - REQ_CHUNK_SIZE = 20 # TODO: not used - - def __init__( - self, - batch_size: int = 1, - model: str = "claude-2.0", - max_tokens_to_sample: int = 256, - temperature: float = 0, # defaults to 1 - **kwargs, # top_p, top_k, etc. - ) -> None: - """Anthropic API wrapper. - - :param model: str - Anthropic model e.g. 
'claude-instant-v1', 'claude-2' - :param max_tokens_to_sample: int - Maximum number of tokens to sample from the model - :param temperature: float - Sampling temperature - :param kwargs: Any - Additional model_args to pass to the API client - """ - super().__init__() - - try: - import anthropic - except ModuleNotFoundError as exception: - raise type(exception)( - "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ -please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", - ) - - self.model = model - # defaults to os.environ.get("ANTHROPIC_API_KEY") - self.client = anthropic.Anthropic() - self.temperature = temperature - self.max_tokens_to_sample = max_tokens_to_sample - self.tokenizer = self.client.get_tokenizer() - self.kwargs = kwargs - - @property - def eot_token_id(self): - # Not sure but anthropic.HUMAN_PROMPT ? - raise NotImplementedError("No idea about anthropic tokenization.") - - @property - def max_length(self) -> int: - return 2048 - - @property - def max_gen_toks(self) -> int: - return self.max_tokens_to_sample - - @property - def batch_size(self): - # Isn't used because we override _loglikelihood_tokens - raise NotImplementedError("No support for logits.") - - @property - def device(self): - # Isn't used because we override _loglikelihood_tokens - raise NotImplementedError("No support for logits.") - - def tok_encode(self, string: str) -> List[int]: - return self.tokenizer.encode(string).ids - - def tok_decode(self, tokens: List[int]) -> str: - return self.tokenizer.decode(tokens) - - def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False): - raise NotImplementedError("No support for logits.") - - def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]: - try: - import anthropic - except ModuleNotFoundError as exception: - raise type(exception)( - "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. 
\ -please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`", - ) - - if not requests: - return [] - - _requests: List[Tuple[str, dict]] = [req.args for req in requests] - - res = [] - for request in tqdm(_requests, disable=disable_tqdm): - try: - inp = request[0] - request_args = request[1] - # generation_kwargs - until = request_args.get("until") - max_gen_toks = request_args.get("max_gen_toks", self.max_length) - temperature = request_args.get("temperature", self.temperature) - response = anthropic_completion( - client=self.client, - model=self.model, - prompt=inp, - max_tokens_to_sample=max_gen_toks, - temperature=temperature, # TODO: implement non-greedy sampling for Anthropic - stop=until, # type: ignore - **self.kwargs, - ) - res.append(response) - - self.cache_hook.add_partial("generate_until", request, response) - except anthropic.APIConnectionError as e: # type: ignore # noqa: F821 - eval_logger.critical(f"Server unreachable: {e.__cause__}") - break - except anthropic.APIStatusError as e: # type: ignore # noqa: F821 - eval_logger.critical(f"API error {e.status_code}: {e.message}") - break - - return res - - def _model_call(self, inps): - # Isn't used because we override _loglikelihood_tokens - raise NotImplementedError() - - def _model_generate(self, context, max_length, eos_token_id): - # Isn't used because we override generate_until - raise NotImplementedError() - - def loglikelihood(self, requests, disable_tqdm: bool = False): - raise NotImplementedError("No support for logits.") - - def loglikelihood_rolling(self, requests, disable_tqdm: bool = False): - raise NotImplementedError("No support for logits.") - - -@register_model("anthropic-chat", "anthropic-chat-completions") -class AnthropicChat(LocalCompletionsAPI): - def __init__( - self, - base_url="https://api.anthropic.com/v1/messages", - tokenizer_backend=None, - **kwargs, - ): - super().__init__( - base_url=base_url, tokenizer_backend=tokenizer_backend, 
**kwargs - ) - eval_logger.warning( - "Chat completions does not support batching. Defaulting to batch size 1." - ) - self._batch_size = 1 - self.anthropic_version = "2023-06-01" - eval_logger.warning( - f"Using Anthropic Version: {self.anthropic_version}. Confirm the current version here: https://docs.anthropic.com/en/api/versioning" - ) - - @cached_property - def api_key(self): - """Override this property to return the API key for the API request.""" - key = os.environ.get("ANTHROPIC_API_KEY", None) - if key is None: - raise ValueError( - "API key not found. Please set the ANTHROPIC_API_KEY environment variable." - ) - return key - - @cached_property - def header(self): - return { - "x-api-key": f"{self.api_key}", - "anthropic-version": self.anthropic_version, - } - - def _create_payload( - self, - messages: List[Dict], - generate=True, - gen_kwargs: dict = None, - eos="\n\nHuman:", - **kwargs, - ) -> dict: - system = ( - messages[0].get("content") if messages[0].get("role") == "system" else None - ) - if system: - messages = messages[1:] - gen_kwargs.pop("do_sample", False) - max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks) - temperature = gen_kwargs.pop("temperature", 0) - stop = handle_stop_sequences(gen_kwargs.pop("until", ["\n\nHuman:"]), eos=eos) - if not isinstance(stop, list): - stop = [stop] - out = { - "messages": messages, - "model": self.model, - "max_tokens": max_tokens, - "temperature": temperature, - "stop_sequences": stop, - **gen_kwargs, - } - if system: - out["system"] = system - return out - - def parse_generations( - self, outputs: Union[Dict, List[Dict]], **kwargs - ) -> List[str]: - res = [] - if not isinstance(outputs, list): - outputs = [outputs] - for out in outputs: - for choices in out["content"]: - res.append(choices["text"]) - return res - - def tok_encode( - self, - string: str, - left_truncate_len=None, - add_special_tokens=None, - **kwargs, - ) -> List[str]: - return [string] - - def loglikelihood(self, requests, 
**kwargs): - raise NotImplementedError( - "Anthropic Chat Completions API does not support the return of loglikelihood" - ) diff --git a/lm-evaluation-harness/lm_eval/models/api_models.py b/lm-evaluation-harness/lm_eval/models/api_models.py deleted file mode 100644 index 23d122033ef2e8c7aaecc2bc29d9174612eb1a4c..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/models/api_models.py +++ /dev/null @@ -1,799 +0,0 @@ -import abc -import asyncio -import copy -import itertools -import json -import logging -from functools import cached_property -from typing import ( - TYPE_CHECKING, - Any, - Awaitable, - Callable, - Dict, - Iterable, - List, - Literal, - NamedTuple, - Optional, - Tuple, - Union, -) - - -try: - import requests - from aiohttp import ClientSession, ClientTimeout, TCPConnector - from tenacity import RetryError, retry, stop_after_attempt, wait_exponential - from tqdm import tqdm - from tqdm.asyncio import tqdm_asyncio -except ModuleNotFoundError: - pass - - -import base64 -from importlib.util import find_spec -from io import BytesIO - -from lm_eval import utils -from lm_eval.api.instance import Instance -from lm_eval.api.model import TemplateLM -from lm_eval.models.utils import Collator, chunks, configure_pad_token - - -if TYPE_CHECKING: - from PIL import Image - - -eval_logger = logging.getLogger(__name__) - -LogLikelihoodInputs = Tuple[Tuple[str, str], List[int], List[int]] - - -# utility class to keep track of json encoded chats -class JsonChatStr(NamedTuple): - prompt: str - - def encode(self, encoding): - return self.prompt.encode(encoding) - - -def create_image_prompt( - imgs: list["Image.Image"], chat: dict, fmt: str = "PNG" -) -> dict: - """ - - Parameters - ---------- - img : list[PIL.Image.Image] - The list of images to encode to base64 - chat : dict - fmt : str, optional - Any format Pillow understands (e.g. "PNG", "JPEG"). - Defaults to "PNG". 
- - Returns - ------- - dict - """ - images = [] - for img in imgs: - buf = BytesIO() - img.save(buf, format=fmt) - img_b64 = base64.b64encode(buf.getvalue()).decode("utf-8") - img_dict = { - "type": "image_url", - "image_url": {"url": f"data:image/png;base64,{img_b64}", "detail": "auto"}, - } - images.append(img_dict) - - # chat is in format of list[dict["role": "user"/"system", "content": str, "type": "text"],...] - # with images, we need "content" to be a list of dicts with "type" and "text"/"image_url" - # currently we do not support few-shots so only one user message - # text content also has placeholders, which apparently is not necessary for API class (confirm) - - if isinstance(chat[-1]["content"], list): - chat[-1]["content"] = images + chat[-1]["content"] - else: - text_content = {"type": "text", "text": chat[-1]["content"]} - chat[-1]["content"] = images + [text_content] - chat[-1].pop("type") - return chat - - -class TemplateAPI(TemplateLM): - MULTIMODAL = True - - def __init__( - self, - model: str = None, - pretrained: str = None, # `model` takes precedence over `pretrained` when passed. - base_url: str = None, - tokenizer: Optional[str] = None, - # Loglikelihood tasks require a tokenizer to calculate context lengths, - # however the requests can be sent as a string if the API doesn't support token inputs. - # use tokenized_requests=False - tokenizer_backend: Optional[ - Literal["tiktoken", "huggingface", "None", "none"] - ] = "huggingface", - truncate: bool = False, - # number of concurrent requests. 
More useful if not batching - num_concurrent: int = 1, - max_retries: int = 3, - max_gen_toks: int = 256, - batch_size: Union[str, int] = 1, - seed: int = 1234, - max_length: Optional[int] = 2048, - add_bos_token: bool = False, - custom_prefix_token_id: int = None, - # send the requests as tokens or strings - tokenized_requests: bool = True, - trust_remote_code: bool = False, - revision: Optional[str] = "main", - use_fast_tokenizer: bool = True, - verify_certificate: bool = True, - eos_string: str = None, - # timeout in seconds - timeout: int = 300, - max_images: int = 1, - **kwargs, - ) -> None: - super().__init__() - missing_packages = [ - pkg - for pkg in ["aiohttp", "tqdm", "tenacity", "requests"] - if find_spec(pkg) is None - ] - if missing_packages: - raise ModuleNotFoundError( - f"Attempted to use an API model, but the required packages {missing_packages} are not installed. " - 'Please install these via `pip install lm-eval[api]` or `pip install -e ."[api]"`' - ) - self.model = model or pretrained - self.base_url = base_url - self.tokenizer = tokenizer - if not isinstance(batch_size, int) and "auto" in batch_size: - eval_logger.warning( - "Automatic batch size is not supported for API models. Defaulting to batch size 1." - ) - elif int(batch_size) > 1: - eval_logger.warning( - "Batch size > 1 detected. Ensure your API supports batched requests with varying total sequence lengths." - ) - self._batch_size = int(batch_size) if batch_size != "auto" else 1 - self._truncate = truncate - self._max_gen_toks = int(max_gen_toks) - self._seed = int(seed) - # max_length - 1 as we always have 1 token for generation - eval_logger.info(f"Using max length {max_length} - 1") - self.max_length = max_length - 1 - if int(num_concurrent) <= 1: - eval_logger.info( - "Concurrent requests are disabled. To enable concurrent requests, set `num_concurrent` > 1." 
- ) - self._concurrent = int(num_concurrent) - self.tokenizer_backend = ( - None if tokenizer_backend in ("None", "none") else tokenizer_backend - ) - self.add_bos_token = add_bos_token - self.custom_prefix_token_id = custom_prefix_token_id - self.tokenized_requests = tokenized_requests - self.max_retries = int(max_retries) - self.verify_certificate = verify_certificate - self._eos_string = eos_string - self.timeout = int(timeout) - self.max_images = int(max_images) - - eval_logger.info(f"Using tokenizer {self.tokenizer_backend}") - if self.tokenizer_backend is None: - self.tokenizer = None - self.tokenized_requests = False - else: - if self.tokenizer is None: - if self.tokenizer_backend == "huggingface": - import transformers - - self.tokenizer = transformers.AutoTokenizer.from_pretrained( - self.tokenizer if self.tokenizer else self.model, - trust_remote_code=trust_remote_code, - revision=revision, - use_fast=use_fast_tokenizer, - ) - # Not used as the API will handle padding but to mirror the behavior of the HFLM - self.tokenizer = configure_pad_token(self.tokenizer) - elif self.tokenizer_backend == "tiktoken": - try: - import tiktoken - - self.tokenizer = tiktoken.encoding_for_model(self.model) - except ModuleNotFoundError as e: - raise ModuleNotFoundError( - "Attempted to use 'openai' LM type, but the package `tiktoken` is not installed. " - "Please install it via `pip install lm-eval[api]` or `pip install -e .[api]`." - ) from e - if "openai" not in self.base_url: - eval_logger.warning( - f"Passed `base_url={self.base_url}` but using (OpenAI) Tiktoken tokenizer backend. " - "Pass `tokenizer_backend=huggingface` and provide the HF tokenizer name if your model does not use Tiktoken." 
- ) - else: - import transformers - - assert isinstance(tokenizer, str), "tokenizer must be a string" - self.tokenizer = transformers.AutoTokenizer.from_pretrained( - tokenizer, - trust_remote_code=trust_remote_code, - revision=revision, - use_fast=use_fast_tokenizer, - ) - - @abc.abstractmethod - def _create_payload( - self, - messages: Union[List[List[int]], List[dict], List[str], str], - *, - generate: bool = True, - gen_kwargs: Optional[dict] = None, - seed: int = 1234, - eos: str = None, - **kwargs, - ) -> dict: - """This method is responsible for creating the json payload that will be sent to the API.""" - raise NotImplementedError - - def create_message( - self, - messages: Union[List[List[int]], List[str], List[JsonChatStr]], - generate=False, - ) -> Union[List[List[int]], List[dict], List[str], str]: - """Helper method to transform the prompt into the expected API input format. messages consist of batched requests""" - if isinstance(messages[0], JsonChatStr): - # for chat completions we need to decode the json string to list[dict,...] - assert self._batch_size == 1, ( - "non-tokenized chat requests are only supported with batch_size=1" - ) - # list[dict["role":..., "content":...],...] - return json.loads(messages[0].prompt) - - if not self.tokenized_requests: - # if messages are tokenized: - if isinstance(messages[0][0], int): - # assuming decoding is lossless. However, this is only for loglikelihood requests - # as we need to compute the context length. For generations, we don't need to tokenize. - messages = self.decode_batch(messages) - if self._batch_size <= 1: - # if batch is 1 return str - return messages[0] - else: - # list[str,...] - return messages - - # list[list[int], ...] 
- return messages - - @staticmethod - @abc.abstractmethod - def parse_logprobs( - outputs: Union[Any, List[Any]], - tokens: List[List[int]] = None, - ctxlen: List[int] = None, - **kwargs, - ) -> List[Tuple[float, bool]]: - """Method used to parse the logprobs from the (batched) API response. This method should return a list of tuples""" - raise NotImplementedError - - @staticmethod - @abc.abstractmethod - def parse_generations(outputs: Union[Any, List[Any]], **kwargs) -> List[str]: - """Method used to parse the generations from the (batched) API response. This method should return a list of str""" - raise NotImplementedError - - @cached_property - def api_key(self) -> str: - """Override this property to return the API key for the API request.""" - return "" - - @cached_property - def header(self) -> dict: - """Override this property to return the headers for the API request.""" - return {"Authorization": f"Bearer {self.api_key}"} - - @property - def tokenizer_name(self) -> str: - """Must be defined for LM subclasses which implement Chat Templating. - Should return the name of the tokenizer or chat template used. - Used only to properly fingerprint caches when requests are being cached with `--cache_requests`, otherwise not used. - """ - return "" - - def apply_chat_template( - self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True - ) -> Union[str, JsonChatStr]: - """Applies a chat template to a list of chat history between user and model.""" - if self.tokenizer_backend == "huggingface" and self.tokenized_requests: - return self.tokenizer.apply_chat_template( - chat_history, - tokenize=False, - add_generation_prompt=add_generation_prompt, - continue_final_message=not add_generation_prompt, - ) - else: - # bit of a hack. 
We'll load back before sending to the API - return JsonChatStr( - json.dumps( - [{**item, "type": "text"} for item in chat_history], - ensure_ascii=False, - ) - ) - - @cached_property - def eot_token_id(self) -> Optional[int]: - if self.tokenizer is None: - return None - else: - if self.tokenizer_backend == "huggingface": - return self.tokenizer.eos_token_id - elif self.tokenizer_backend == "tiktoken": - return self.tokenizer.eot_token - - @cached_property - def eos_string(self) -> Optional[str]: - if self._eos_string: - return self._eos_string - elif self.tokenizer is not None: - if self.tokenizer_backend == "huggingface": - return self.tokenizer.eos_token - elif self.tokenizer_backend == "tiktoken": - return self.tokenizer.decode([self.tokenizer.eot_token]) - else: - eval_logger.warning( - "Cannot determine EOS string to pass to stop sequence. Manually set by passing `eos_string` to model_args." - ) - return None - - @cached_property - def prefix_token_id(self) -> Optional[int]: - if self.tokenizer is None: - return None - else: - if self.custom_prefix_token_id is not None: - return self.custom_prefix_token_id - if self.tokenizer_backend == "huggingface": - if self.tokenizer.bos_token_id is not None: - return self.tokenizer.bos_token_id - return self.tokenizer.eos_token_id - else: - return self.tokenizer.eot_token - - def tok_encode( - self, - string: str, - left_truncate_len: int = None, - add_special_tokens: bool = False, - truncation: bool = False, - **kwargs, - ) -> Union[List[List[int]], List[int], List[str]]: - if self.tokenizer_backend is None: - return [string] - elif self.tokenizer_backend == "huggingface": - # by default for CausalLM - false or self.add_bos_token is set - if not add_special_tokens: - add_special_tokens = False or self.add_bos_token - encoding: Union[List[List[int]], List[int]] = self.tokenizer( - string, - add_special_tokens=add_special_tokens, - truncation=truncation, - return_attention_mask=False, - ).input_ids - - # left-truncate the 
encoded context to be at most `left_truncate_len` tokens long - if left_truncate_len: - if not isinstance(string, str): - encoding = [enc[-left_truncate_len:] for enc in encoding] - else: - encoding = encoding[-left_truncate_len:] - - return encoding - - else: - try: - encoding = self.tokenizer.encode(string) - except Exception: - encoding = self.tokenizer.encode_batch(string) - return encoding - - def decode_batch(self, tokens: List[List[int]]) -> List[str]: - if self.tokenizer_backend == "huggingface": - return self.tokenizer.batch_decode(tokens) - elif self.tokenizer_backend == "tiktoken": - return self.tokenizer.decode_batch(tokens) - - def model_call( - self, - messages: Union[List[List[int]], List[str], List[JsonChatStr]], - *, - generate: bool = True, - gen_kwargs: Optional[Dict] = None, - **kwargs, - ) -> Optional[dict]: - # !!! Copy: shared dict for each request, need new object !!! - gen_kwargs = copy.deepcopy(gen_kwargs) - try: - response = requests.post( - self.base_url, - json=self._create_payload( - self.create_message(messages), - generate=generate, - gen_kwargs=gen_kwargs, - seed=self._seed, - eos=self.eos_string, - **kwargs, - ), - headers=self.header, - verify=self.verify_certificate, - ) - if not response.ok: - eval_logger.warning( - f"API request failed with error message: {response.text}. Retrying..." - ) - response.raise_for_status() - return response.json() - except RetryError: - eval_logger.error( - "API request failed after multiple retries. Please check the API status." - ) - return None - - async def amodel_call( - self, - session: ClientSession, - messages: Union[List[List[int]], List[str], List[JsonChatStr]], - *, - generate: bool = True, - cache_keys: list = None, - ctxlens: Optional[List[int]] = None, - gen_kwargs: Optional[Dict] = None, - **kwargs, - ) -> Union[List[str], List[Tuple[float, bool]], None]: - # !!! Copy: shared dict for each request, need new object !!! 
- gen_kwargs = copy.deepcopy(gen_kwargs) - payload = self._create_payload( - self.create_message(messages), - generate=generate, - gen_kwargs=gen_kwargs, - seed=self._seed, - **kwargs, - ) - cache_method = "generate_until" if generate else "loglikelihood" - try: - async with session.post( - self.base_url, - json=payload, - headers=self.header, - ) as response: - if not response.ok: - error_text = await response.text() - eval_logger.warning( - f"API request failed with error message: {error_text}. Retrying..." - ) - # raising exception will retry the request - response.raise_for_status() - outputs = await response.json() - answers = ( - self.parse_generations( - outputs=outputs, - ) - if generate - else self.parse_logprobs( - outputs=outputs, - tokens=messages, - ctxlens=ctxlens, - ) - ) - if cache_keys: - for res, cache in zip(answers, cache_keys): - self.cache_hook.add_partial(cache_method, cache, res) - return answers - # If the retries also fail - except RetryError: - eval_logger.error( - "API request failed after multiple retries. Please check the API status." - ) - return None - - def batch_loglikelihood_requests( - self, chunks: Iterable[List[LogLikelihoodInputs]] - ) -> Tuple[List[List[int]], List[int], List[Tuple[str, str]]]: - inputs = [] - ctxlens = [] - cache_keys = [] - for chunk in chunks: - for cache_key, context_enc, continuation_enc in chunk: - # max_length - 1 as we always have 1 token for generation - inp = (context_enc + continuation_enc)[-self.max_length :] - if len(inp) < len(context_enc + continuation_enc): - eval_logger.warning( - f"Context length ({len(context_enc)}) + continuation length ({len(continuation_enc)}) > max_length ({self.max_length}). Left truncating context." 
- ) - ctxlen = len(context_enc) - max( - 0, len(context_enc) + len(continuation_enc) - self.max_length - ) - - inputs.append(inp) - ctxlens.append(ctxlen) - cache_keys.append(cache_key) - return inputs, ctxlens, cache_keys - - async def get_batched_requests( - self, - requests: list, - cache_keys: list, - *, - generate: bool = True, - ctxlens: List[int] = None, - **kwargs, - ) -> Union[List[List[str]], List[List[Tuple[float, bool]]]]: - ctxlens = ctxlens if ctxlens else [None] * len(requests) - conn = TCPConnector(limit=self._concurrent, ssl=self.verify_certificate) - async with ClientSession( - connector=conn, timeout=ClientTimeout(total=self.timeout) - ) as session: - retry_: Callable[..., Awaitable[Any]] = retry( - stop=stop_after_attempt(self.max_retries), - wait=wait_exponential(multiplier=0.5, min=1, max=10), - reraise=True, - )(self.amodel_call) - # Create tasks for each batch of request - tasks = [ - asyncio.create_task( - retry_( - session=session, - messages=message, - cache_keys=cache_key, - generate=generate, - ctxlens=ctxlen, - **kwargs, - ) - ) - for message, cache_key, ctxlen in zip( - chunks(requests, n=self._batch_size), - chunks(cache_keys, n=self._batch_size), - chunks(ctxlens, n=self._batch_size), - ) - ] - - return await tqdm_asyncio.gather(*tasks, desc="Requesting API") - - def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]: - assert self.tokenizer is not None, ( - "Tokenizer is required for loglikelihood tasks to compute context lengths." - ) - res = [] - - def _collate(req: LogLikelihoodInputs): - """Defines the key for the sorted method""" - # the negative sign on len(toks) sorts descending - this has a few advantages: - # - time estimates will always be over not underestimates, which is more useful for planning - # - to know the size of a batch when going through the list, you know the first one is always the batch - # padded context length. 
this is useful to simplify the batching logic and more importantly to make - # automatic adaptive batches much much easier to implement - # - any OOMs will happen right away rather than near the end - - toks = req[1] + req[2] - return -len(toks), tuple(toks) - - re_ord = Collator( - requests, - sort_fn=_collate, - group_by=None, - ) - # if concurrent then we'll batch in the async context - chunked = re_ord.get_batched(n=self._batch_size if self._concurrent <= 1 else 0) - if self._concurrent <= 1: - pbar = tqdm(desc="Requesting API", total=len(requests)) - for chunk in chunked: - inputs, ctxlens, cache_keys = self.batch_loglikelihood_requests([chunk]) - - outputs = retry( - stop=stop_after_attempt(self.max_retries), - wait=wait_exponential(multiplier=0.5, min=1, max=10), - reraise=True, - )(self.model_call)(messages=inputs, generate=False) - if isinstance(outputs, dict): - outputs = [outputs] - for answer_, cache_key in zip( - self.parse_logprobs( - outputs=outputs, tokens=inputs, ctxlens=ctxlens - ), - cache_keys, - ): - if answer_ is not None: - res.append(answer_) - # cache requests that aren't from a loglikelihood_rolling request - if cache_key is not None: - self.cache_hook.add_partial( - "loglikelihood", cache_key, answer_ - ) - pbar.update(1) - else: - inputs, ctxlens, cache_keys = self.batch_loglikelihood_requests(chunked) - res = itertools.chain.from_iterable( - asyncio.run( - self.get_batched_requests( - inputs, cache_keys, generate=False, ctxlens=ctxlens - ) - ) - ) - - return re_ord.get_original(res) - - def generate_until( - self, requests: List[Instance], disable_tqdm: bool = False - ) -> List[str]: - res = [] - - def _collate_gen(_requests): - # sort by the length of the non-tokenized contexts - return -len(_requests[0]) - - # Let the API deal with tokenization - if len(requests[0].args) > 2: - assert self.tokenizer is None, ( - "tokenizer is not supported for multimodal requests yet!" - ) - eval_logger.info( - f"Using max_images {self.max_images}. 
Set in the model args." - ) - requests, all_gen_kwargs, auxiliary_args = zip( - *(req.args for req in requests) - ) - requests = tuple( - JsonChatStr( - json.dumps( - create_image_prompt( - y["visual"][: self.max_images], json.loads(x.prompt) - ) - ) - ) - for x, y in zip(requests, auxiliary_args) - ) - else: - requests, all_gen_kwargs = zip(*(req.args for req in requests)) - if self.tokenized_requests: - encodings_list = self.tok_encode( - requests, add_special_tokens=self.add_bos_token - ) - else: - encodings_list = [None] * len(requests) - requests = [ - (a, b, c) for a, b, c in zip(requests, all_gen_kwargs, encodings_list) - ] - - re_ord = Collator( - requests, - sort_fn=_collate_gen, - group_by="gen_kwargs", - ) - chunked = re_ord.get_batched( - n=self._batch_size if self._concurrent <= 1 else 0, batch_fn=None - ) - if not self.tokenized_requests: - eval_logger.info( - "Tokenized requests are disabled. Context + generation length is not checked." - ) - if self._concurrent <= 1: - pbar = tqdm(desc="Requesting API", total=len(requests)) - for chunk in chunked: - contexts, all_gen_kwargs, encodings_list = zip(*chunk) - if self.tokenized_requests: - max_gen_toks = all_gen_kwargs[0].get( - "max_gen_toks", self._max_gen_toks - ) - max_context_len = self.max_length - max_gen_toks - - encodings_list = [x[-max_context_len:] for x in encodings_list] - - if any( - len(x) + max_gen_toks > self.max_length for x in encodings_list - ): - eval_logger.warning( - f"Some contexts exceeded (max length: ({self.max_length}) - max_gen_toks: ({max_gen_toks}). They were left truncated." 
- ) - - req = encodings_list if self.tokenized_requests else contexts - outputs = retry( - stop=stop_after_attempt(self.max_retries), - wait=wait_exponential(multiplier=0.5, min=1, max=10), - reraise=True, - )(self.model_call)( - messages=req, - generate=True, - gen_kwargs=copy.deepcopy(all_gen_kwargs[0]), - ) - for generated_text, context in zip( - self.parse_generations( - outputs=outputs, - contexts=contexts, - ), - contexts, - ): - if generated_text is not None: - res.append(generated_text) - - # partial caching - if context is not None: - self.cache_hook.add_partial( - "generate_until", - (context, all_gen_kwargs[0]), - generated_text, - ) - pbar.update(1) - else: - for chunk in chunked: - contexts, all_gen_kwargs, encodings_list = zip(*chunk) - if self.tokenized_requests: - max_gen_toks = all_gen_kwargs[0].get( - "max_gen_toks", self._max_gen_toks - ) - max_context_len = self.max_length - max_gen_toks - - encodings_list = [x[-max_context_len:] for x in encodings_list] - - if any( - len(x) + max_gen_toks > self.max_length for x in encodings_list - ): - eval_logger.warning( - f"Some contexts exceeded (max length: ({self.max_length}) - max_gen_toks ({max_gen_toks}). They were left truncated." 
- ) - - req = encodings_list if self.tokenized_requests else contexts - results = itertools.chain.from_iterable( - asyncio.run( - self.get_batched_requests( - req, - cache_keys=[(ctx, all_gen_kwargs[0]) for ctx in contexts], - generate=True, - gen_kwargs=copy.deepcopy(all_gen_kwargs[0]), - ) - ) - ) - res.extend(results) - - return re_ord.get_original(res) - - def loglikelihood_rolling( - self, requests: List[Instance], disable_tqdm: bool = False - ) -> List[float]: - loglikelihoods = [] - - for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm): - rolling_token_windows = list( - map( - utils.make_disjoint_window, - utils.get_rolling_token_windows( - token_list=self.tok_encode(string), - prefix_token=self.prefix_token_id, - # max_seq_len - (1 for context) - max_seq_len=self.max_length - 1, - context_len=1, - ), - ) - ) - - # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case - rolling_token_windows = [(None,) + x for x in rolling_token_windows] - - string_nll = self._loglikelihood_tokens( - rolling_token_windows, - disable_tqdm=True, - ) - - # discard is_greedy - string_nll = [x[0] for x in string_nll] - - string_nll = sum(string_nll) - loglikelihoods.append(string_nll) - - # cache this loglikelihood_rolling request - self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll) - return loglikelihoods diff --git a/lm-evaluation-harness/lm_eval/models/dummy.py b/lm-evaluation-harness/lm_eval/models/dummy.py deleted file mode 100644 index 014ad49ee36f756acd0428340f945312d79590e8..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/models/dummy.py +++ /dev/null @@ -1,41 +0,0 @@ -import random - -from tqdm import tqdm - -from lm_eval.api.model import LM -from lm_eval.api.registry import register_model - - -@register_model("dummy") -class DummyLM(LM): - def __init__(self) -> None: - super().__init__() - - @classmethod - def create_from_arg_string(cls, 
arg_string, additional_config=None): - return cls() - - def loglikelihood(self, requests, disable_tqdm: bool = False): - res = [] - - for _ in tqdm(requests, disable=disable_tqdm): - res.append((-random.random(), False)) - - return res - - def generate_until(self, requests, disable_tqdm: bool = False): - res = [] - - for request in tqdm(requests, disable=disable_tqdm): - res.append("lol") - assert request.arguments[0].strip() != "" - - return res - - def loglikelihood_rolling(self, requests, disable_tqdm: bool = False): - res = [] - - for _ in tqdm(requests, disable=disable_tqdm): - res.append(-random.random()) - - return res diff --git a/lm-evaluation-harness/lm_eval/models/gguf.py b/lm-evaluation-harness/lm_eval/models/gguf.py deleted file mode 100644 index 52aef0dee62edb2eb390fe695e939f8e06d0555f..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/models/gguf.py +++ /dev/null @@ -1,132 +0,0 @@ -import logging -import time - -import requests -from requests.exceptions import RequestException -from tqdm import tqdm - -from lm_eval.api.model import LM -from lm_eval.api.registry import register_model - - -logger = logging.getLogger(__name__) - - -def get_result(logprobs, context_length): - is_greedy = True - offsets = logprobs["text_offset"] - tokens = logprobs["tokens"] - tokens_logprobs = logprobs["token_logprobs"] - - idx = 0 - while offsets[idx] < context_length: - idx += 1 - continuation_logprobs = sum(tokens_logprobs[idx:-1]) - for i in range(idx, len(tokens)): - token = tokens[i] - top_tokens = logprobs["top_logprobs"][i] - top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x]) - if top_token != token: - is_greedy = False - break - - return continuation_logprobs, is_greedy - - -@register_model("gguf", "ggml") -class GGUFLM(LM): - def __init__(self, base_url=None, max_length=2048, **kwargs): - super().__init__() - self.base_url = base_url - assert self.base_url, "must pass `base_url` to use GGUF LM!" 
- self.logprobs = 10 - self.temperature = 0.0 - self.max_length = max_length - - def gguf_completion( - self, context, continuation=None, stop=None, retries=3, delay=5, **kwargs - ): - for _ in range(retries): - try: - prompt = context - request = { - "prompt": prompt, - "logprobs": self.logprobs, - "temperature": self.temperature, - } - if continuation: - prompt += continuation - request.update({"prompt": prompt, "max_tokens": 1, "echo": True}) - if stop is not None: - request["stop"] = stop - response = requests.post( - f"{self.base_url}/v1/completions", json=request - ) - response.raise_for_status() - return response.json() - except RequestException as e: - logger.error(f"RequestException: {e}") - time.sleep(delay) # wait before retrying - else: - raise RuntimeError( - f"Failed to get a valid response after {retries} retries." - ) - - def loglikelihood(self, requests, disable_tqdm: bool = False): - if not requests: - return [] - res = [] - for context, continuation in tqdm( - [req.args for req in requests], disable=disable_tqdm - ): - response = self.gguf_completion(context=context, continuation=continuation) - if response and "choices" in response and response["choices"]: - choice = response["choices"][0] - logprobs = choice.get("logprobs") - if ( - logprobs - and "token_logprobs" in logprobs - and logprobs["token_logprobs"] - ): - logprob, is_greedy = get_result(logprobs, len(context)) - res.append((logprob, is_greedy)) - else: - logger.warning( - "Invalid logprobs data. Expected 'logprobs' to contain 'token_logprobs' list." - ) - else: - logger.error( - f"Invalid response for loglikelihood. 
Response: {response}" - ) - assert False - return res - - def generate_until(self, requests, disable_tqdm: bool = False): - if not requests: - return [] - - res = [] - for request in tqdm([req.args for req in requests], disable=disable_tqdm): - inp = request[0] - request_args = request[1] - until = request_args.get("until", [""]) - response = self.gguf_completion(context=inp, stop=until) - if response and "choices" in response and response["choices"]: - choice = response["choices"][0] - if "text" in choice: - generated_text = choice["text"].strip() - res.append(generated_text) - else: - logger.error( - f"Invalid response for greedy_until. Response: {response}" - ) - res.append(None) # Add default value in case of error - else: - logger.error(f"Invalid response for greedy_until. Response: {response}") - res.append(None) # Add default value in case of error - return res - - def loglikelihood_rolling(self, requests, disable_tqdm: bool = False): - raise NotImplementedError( - "loglikelihood_rolling not yet supported for GGUF models" - ) diff --git a/lm-evaluation-harness/lm_eval/models/hf_audiolm.py b/lm-evaluation-harness/lm_eval/models/hf_audiolm.py deleted file mode 100644 index 082e21f9331a13ac719613cc7e9abcd563296692..0000000000000000000000000000000000000000 --- a/lm-evaluation-harness/lm_eval/models/hf_audiolm.py +++ /dev/null @@ -1,307 +0,0 @@ -import copy -from typing import Dict, List, Optional, Tuple, Union - -import torch -import transformers -from tqdm import tqdm -from transformers import BatchEncoding - -from lm_eval.api.instance import Instance -from lm_eval.api.registry import register_model -from lm_eval.models.huggingface import HFLM -from lm_eval.models.utils import ( - Collator, - replace_placeholders, - stop_sequences_criteria, -) - - -DEFAULT_AUDIO_PLACEHOLDERS = ["