Charlie81 committed on Jun 7, 2025

Commit

3bca14b

1 Parent(s): f9e2c3f

delete local harness and remove imports

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

lm-evaluation-harness/.coveragerc +0 -28
lm-evaluation-harness/.flake8 +0 -5
lm-evaluation-harness/.github/workflows/new_tasks.yml +0 -71
lm-evaluation-harness/.github/workflows/publish.yml +0 -97
lm-evaluation-harness/.github/workflows/unit_tests.yml +0 -114
lm-evaluation-harness/.gitignore +0 -47
lm-evaluation-harness/.pre-commit-config.yaml +0 -60
lm-evaluation-harness/CITATION.bib +0 -10
lm-evaluation-harness/CODEOWNERS +0 -1
lm-evaluation-harness/LICENSE.md +0 -21
lm-evaluation-harness/MANIFEST.in +0 -1
lm-evaluation-harness/README.md +0 -625
lm-evaluation-harness/ignore.txt +0 -8
lm-evaluation-harness/lm_eval/__init__.py +0 -7
lm-evaluation-harness/lm_eval/__main__.py +0 -530
lm-evaluation-harness/lm_eval/api/filter.py +0 -56
lm-evaluation-harness/lm_eval/api/group.py +0 -115
lm-evaluation-harness/lm_eval/api/instance.py +0 -38
lm-evaluation-harness/lm_eval/api/metrics.py +0 -578
lm-evaluation-harness/lm_eval/api/model.py +0 -493
lm-evaluation-harness/lm_eval/api/registry.py +0 -196
lm-evaluation-harness/lm_eval/api/samplers.py +0 -232
lm-evaluation-harness/lm_eval/api/task.py +0 -1879
lm-evaluation-harness/lm_eval/caching/cache.py +0 -59
lm-evaluation-harness/lm_eval/decontamination/__init__.py +0 -0
lm-evaluation-harness/lm_eval/decontamination/archiver.py +0 -174
lm-evaluation-harness/lm_eval/decontamination/decontaminate.py +0 -166
lm-evaluation-harness/lm_eval/decontamination/janitor.py +0 -328
lm-evaluation-harness/lm_eval/evaluator.py +0 -761
lm-evaluation-harness/lm_eval/evaluator_utils.py +0 -554
lm-evaluation-harness/lm_eval/filters/__init__.py +0 -25
lm-evaluation-harness/lm_eval/filters/custom.py +0 -17
lm-evaluation-harness/lm_eval/filters/decontamination.py +0 -25
lm-evaluation-harness/lm_eval/filters/extraction.py +0 -233
lm-evaluation-harness/lm_eval/filters/selection.py +0 -61
lm-evaluation-harness/lm_eval/filters/transformation.py +0 -122
lm-evaluation-harness/lm_eval/loggers/__init__.py +0 -2
lm-evaluation-harness/lm_eval/loggers/evaluation_tracker.py +0 -537
lm-evaluation-harness/lm_eval/loggers/utils.py +0 -149
lm-evaluation-harness/lm_eval/loggers/wandb_logger.py +0 -358
lm-evaluation-harness/lm_eval/models/__init__.py +0 -36
lm-evaluation-harness/lm_eval/models/anthropic_llms.py +0 -367
lm-evaluation-harness/lm_eval/models/api_models.py +0 -799
lm-evaluation-harness/lm_eval/models/dummy.py +0 -41
lm-evaluation-harness/lm_eval/models/gguf.py +0 -132
lm-evaluation-harness/lm_eval/models/hf_audiolm.py +0 -307
lm-evaluation-harness/lm_eval/models/hf_steered.py +0 -243
lm-evaluation-harness/lm_eval/models/hf_vlms.py +0 -757
lm-evaluation-harness/lm_eval/models/huggingface.py +0 -1480
lm-evaluation-harness/lm_eval/models/ibm_watsonx_ai.py +0 -445

lm-evaluation-harness/.coveragerc DELETED Viewed

@@ -1,28 +0,0 @@
-[run]
-# tasks that aren't wired up.
-omit =
-    lm_eval/tasks/quac.py
-    lm_eval/tasks/storycloze.py
-    lm_eval/tasks/cbt.py
-    lm_eval/tasks/sat.py
-    lm_eval/tasks/triviaqa.py
-    lm_eval/tasks/naturalqs.py
-    lm_eval/models/dummy.py
-[report]
-exclude_lines =
-    # Skip any pass lines such as may be used for @abstractmethod
-    pass
-    # Have to re-enable the standard pragma
-    pragma: no cover
-    # Don't complain about missing debug-only code:
-    def __repr__
-    if self\.debug
-    # Don't complain if tests don't hit defensive assertion code:
-    raise AssertionError
-    raise NotImplementedError
-    return NotImplemented

lm-evaluation-harness/.flake8 DELETED Viewed

@@ -1,5 +0,0 @@
-[flake8]
-ignore = E203, E266, E501, W503, F403, F401, C901
-max-line-length = 127
-max-complexity = 10
-select = B,C,E,F,W,T4,B9

lm-evaluation-harness/.github/workflows/new_tasks.yml DELETED Viewed

@@ -1,71 +0,0 @@
-name: Tasks Modified
-on:
-  push:
-    branches:
-      - 'main'
-  pull_request:
-    branches:
-      - 'main'
-  workflow_dispatch:
-# comment/edit out the above to stop/change the triggers
-jobs:
-  changed_files:
-    runs-on: ubuntu-latest  # windows-latest || macos-latest
-    timeout-minutes: 120
-    name: Scan for changed tasks
-    steps:
-      - name: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 2  # OR "2" -> To retrieve the preceding commit.
-      # Uses the tj-actions/changed-files action to check for changes.
-      # The `files_yaml` input optionally takes a yaml string to specify filters,
-      # and prepends the filter name to the standard output names.
-      - name: Check task folders
-        id: changed-tasks
-        uses: tj-actions/changed-files@v46.0.5
-        with:
-          # tasks checks the tasks folder and api checks the api folder for changes
-          files_yaml: |
-            tasks:
-              - lm_eval/tasks/**
-            api:
-              - lm_eval/api/**
-          write_output_files: true
-    # The next step is optional; the files are written to the workspace by default (above).
-    # so it's just for debugging
-      - name: Run Tests
-        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
-        run: |
-          echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV'
-          echo "One or more test file(s) has changed."
-          echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
-      - name: Set up Python 3.9
-        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
-        uses: actions/setup-python@v5
-        with:
-          python-version: 3.9
-          cache: 'pip'
-          cache-dependency-path: setup.py
-      - name: Install dependencies
-        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
-        run: |
-            python -m pip install --upgrade pip
-            pip install -e '.[dev,ifeval]' --extra-index-url https://download.pytorch.org/whl/cpu
-    #   Install optional git dependencies
-    #       pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
-    #       if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-      - name: Test with pytest
-        # if new tasks are added, run tests on them
-        if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
-        run: python -m pytest tests/test_tasks.py -s -vv
-        # if api is modified, run tests on it
-      - name: Test more tasks with pytest
-        env:
-          API: true
-        if: steps.changed-tasks.outputs.api_any_modified == 'true'
-        run: python -m pytest tests/test_tasks.py -s -vv

lm-evaluation-harness/.github/workflows/publish.yml DELETED Viewed

@@ -1,97 +0,0 @@
-name: Publish Python distribution to PyPI
-on:
-  push:
-    tags:
-      - '*'
-jobs:
-  build:
-    name: Build distribution
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v4
-    - name: Set up Python
-      uses: actions/setup-python@v5
-      with:
-        python-version: "3.x"
-    - name: Check version consistency
-      run: |
-        # Extract version from pyproject.toml
-        PYPROJECT_VERSION=$(grep 'version = ' pyproject.toml | head -1 | cut -d'"' -f2)
-        # Extract version from __init__.py
-        INIT_VERSION=$(grep '__version__ = ' lm_eval/__init__.py | head -1 | cut -d'"' -f2)
-        echo "Version in pyproject.toml: $PYPROJECT_VERSION"
-        echo "Version in __init__.py: $INIT_VERSION"
-        # Check if versions match
-        if [ "$PYPROJECT_VERSION" != "$INIT_VERSION" ]; then
-          echo "Error: Version mismatch between pyproject.toml ($PYPROJECT_VERSION) and __init__.py ($INIT_VERSION)"
-          exit 1
-        fi
-        echo "Version check passed: $PYPROJECT_VERSION"
-    - name: Install pypa/build
-      run: >-
-        python3 -m
-        pip install
-        build
-        --user
-    - name: Build a binary wheel and a source tarball
-      run: python3 -m build
-    - name: Store the distribution packages
-      uses: actions/upload-artifact@v4
-      with:
-        name: python-package-distributions
-        path: dist/
-  publish-to-pypi:
-    name: >-
-      Publish Python distribution to PyPI
-    if: startsWith(github.ref, 'refs/tags/')  # only publish to PyPI on tag pushes
-    needs:
-    - build
-    runs-on: ubuntu-latest
-    environment:
-      name: pypi
-      url: https://pypi.org/p/lm_eval
-    permissions:
-      id-token: write  # IMPORTANT: mandatory for trusted publishing
-    steps:
-    - name: Download all the dists
-      uses: actions/download-artifact@v4
-      with:
-        name: python-package-distributions
-        path: dist/
-    - name: Publish distribution to PyPI
-      uses: pypa/gh-action-pypi-publish@release/v1
-  publish-to-testpypi:
-    name: Publish Python distribution to TestPyPI
-    needs:
-    - build
-    runs-on: ubuntu-latest
-    environment:
-      name: testpypi
-      url: https://test.pypi.org/p/lm_eval
-    permissions:
-      id-token: write  # IMPORTANT: mandatory for trusted publishing
-    steps:
-    - name: Download all the dists
-      uses: actions/download-artifact@v4
-      with:
-        name: python-package-distributions
-        path: dist/
-    - name: Publish distribution to TestPyPI
-      uses: pypa/gh-action-pypi-publish@release/v1
-      with:
-        repository-url: https://test.pypi.org/legacy/

lm-evaluation-harness/.github/workflows/unit_tests.yml DELETED Viewed

@@ -1,114 +0,0 @@
-# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
-# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
-# just comment out unwanted steps to turn off the test.
-name: Unit Tests
-on:
-  push:
-    branches:
-      - 'main'
-  pull_request:
-    branches:
-      - 'main'
-  workflow_dispatch:
-# Jobs run concurrently and steps run sequentially within a job.
-# jobs: linter and cpu_tests. Add more jobs/steps as required.
-jobs:
-  linter:
-    name: Linters
-    runs-on: ubuntu-latest
-    timeout-minutes: 5
-    steps:
-      - name: Checkout Code
-        uses: actions/checkout@v4
-      - name: Set up Python 3.9
-        uses: actions/setup-python@v5
-        with:
-          python-version: 3.9
-          cache: pip
-          cache-dependency-path: pyproject.toml
-      - name: Pre-Commit
-        env:
-          SKIP: "no-commit-to-branch,mypy"
-        uses: pre-commit/action@v3.0.1
-  # Job 2
-  testcpu:
-    name: CPU Tests
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: true
-      matrix:
-        python-version: ["3.9", "3.10", "3.11"]
-    timeout-minutes: 30
-    steps:
-      - name: Checkout Code
-        uses: actions/checkout@v4
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python-version }}
-          cache: pip
-          cache-dependency-path: pyproject.toml
-      # Cache HuggingFace cache directory for CPU tests
-      - name: Cache HuggingFace cache (CPU tests)
-        uses: actions/cache@v3
-        id: cache-hf-cpu
-        with:
-          path: ~/.cache/huggingface
-          key: ${{ runner.os }}-hf-cache-cpu
-          restore-keys: |
-            ${{ runner.os }}-hf-cache-cpu
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -e '.[dev]' --extra-index-url https://download.pytorch.org/whl/cpu
-          pip install hf_xet
-      - name: Test with pytest
-        run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py --ignore=tests/models/test_hf_steered.py
-        continue-on-error: true  # Continue workflow even if tests fail
-      # Save test artifacts
-      - name: Archive test artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: output_testcpu${{ matrix.python-version }}
-          path: |
-            test_logs/*
-#  testmodels:
-#    name: External LM Tests
-#    runs-on: ubuntu-latest
-#    timeout-minutes: 30
-#    steps:
-#      - name: Checkout Code
-#        uses: actions/checkout@v4
-#      - name: Set up Python 3.9
-#        uses: actions/setup-python@v5
-#        with:
-#          python-version: 3.9
-#          cache: pip
-#          cache-dependency-path: pyproject.toml
-#
-#      # Cache HuggingFace cache directory for External LM tests
-#      - name: Cache HuggingFace cache (External LM tests)
-#        uses: actions/cache@v3
-#        id: cache-hf-lm
-#        with:
-#          path: ~/.cache/huggingface
-#          key: ${{ runner.os }}-hf-cache-external-lm
-#          restore-keys: |
-#            ${{ runner.os }}-hf-cache-external-lm
-#
-#      - name: Install dependencies
-#        run: |
-#          python -m pip install --upgrade pip
-#          pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu
-#          pip install -U transformers peft accelerate
-#
-#      - name: Test with pytest
-#        run: python -m pytest tests/models --showlocals -s -vv
-#        continue-on-error: true  # Continue workflow even if tests fail

lm-evaluation-harness/.gitignore DELETED Viewed

@@ -1,47 +0,0 @@
-# macOS system files
-.DS_Store
-# Virtual environments
-.venv/
-venv/
-ENV/
-env/
-*.env
-# Python bytecode and build artifacts
-__pycache__/
-*.py[cod]
-*.so
-*.egg-info/
-build/
-dist/
-# IDE & editor settings
-.vscode/
-.idea/
-# Jupyter
-.ipynb_checkpoints/
-profile_default/
-ipython_config.py
-# Output and data
-output/
-data/
-temp/
-test_logs/
-# Caching
-lm_eval/caching/.cache
-lm_cache/
-# Logging
-*.log
-logs/
-# wandb experiment tracking
-wandb/
-examples/wandb/
-# PyInstaller
-*.spec

lm-evaluation-harness/.pre-commit-config.yaml DELETED Viewed

@@ -1,60 +0,0 @@
-# Ignore test linting to avoid conflicting changes to version stability.
-exclude: ^tests/testdata/
-repos:
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
-    hooks:
-      - id: check-added-large-files
-      - id: check-ast
-      - id: check-byte-order-marker
-      - id: check-case-conflict
-      - id: check-json
-      - id: check-merge-conflict
-        args: [--assume-in-merge]
-      - id: check-symlinks
-      - id: check-yaml
-        args: ["--unsafe"]
-      - id: destroyed-symlinks
-      - id: detect-private-key
-      - id: end-of-file-fixer
-      - id: no-commit-to-branch
-        always_run: false
-      - id: requirements-txt-fixer
-      - id: trailing-whitespace
-        args: [--markdown-linebreak-ext=md]
-      - id: fix-byte-order-marker
-        exclude: docs/CNAME
-      - id: fix-encoding-pragma
-        args: [--remove]
-      - id: mixed-line-ending
-        args: [--fix=lf]
-  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.0
-    hooks:
-      # Run the linter.
-      - id: ruff
-        args:
-          - --fix
-        # Run the formatter.
-      - id: ruff-format
-  - repo: https://github.com/codespell-project/codespell
-    rev: v2.4.1
-    hooks:
-      - id: codespell
-        exclude: >
-          (?x)^(
-              .*\.json|ignore.txt|lm_eval/tasks/.*|.*yaml|.*\.ipynb
-          )$
-        args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
-  - repo: https://github.com/jackdewinter/pymarkdown
-    rev: v0.9.29
-    hooks:
-      - id: pymarkdown
-        exclude: ^lm_eval/tasks/
-        args: [fix, -r]
-#  - repo: https://github.com/pre-commit/mirrors-mypy
-#    rev: v1.5.1
-#    hooks:
-#    - id: mypy
-#      additional_dependencies: [".[sentencepiece,multilingual,promptsource,gptq]", "types-PyYAML", "types-requests"]
-#      exclude: ^tests/.*$

lm-evaluation-harness/CITATION.bib DELETED Viewed

@@ -1,10 +0,0 @@
-@misc{eval-harness,
-  author       = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
-  title        = {A framework for few-shot language model evaluation},
-  month        = 12,
-  year         = 2023,
-  publisher    = {Zenodo},
-  version      = {v0.4.0},
-  doi          = {10.5281/zenodo.10256836},
-  url          = {https://zenodo.org/records/10256836}
-}

lm-evaluation-harness/CODEOWNERS DELETED Viewed

	@@ -1 +0,0 @@
1	- * @baberabb @stellaathena

lm-evaluation-harness/LICENSE.md DELETED Viewed

@@ -1,21 +0,0 @@
-MIT License
-Copyright (c) 2020 EleutherAI
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.

lm-evaluation-harness/MANIFEST.in DELETED Viewed

	@@ -1 +0,0 @@
1	- recursive-include tests

lm-evaluation-harness/README.md DELETED Viewed

@@ -1,625 +0,0 @@
-# Language Model Evaluation Harness
-[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10256836.svg)](https://doi.org/10.5281/zenodo.10256836)
----
-## Latest News 📣
-- [2025/03] Added support for steering HF models!
-- [2025/02] Added [SGLang](https://docs.sglang.ai/) support!
-- [2024/09] We are prototyping allowing users of LM Evaluation Harness to create and evaluate on text+image multimodal input, text output tasks, and have just added the `hf-multimodal` and `vllm-vlm` model types and `mmmu` task as a prototype feature. We welcome users to try out this in-progress feature and stress-test it for themselves, and suggest they check out [`lmms-eval`](https://github.com/EvolvingLMMs-Lab/lmms-eval), a wonderful project originally forking off of the lm-evaluation-harness, for a broader range of multimodal tasks, models, and features.
-- [2024/07] [API model](docs/API_guide.md) support has been updated and refactored, introducing support for batched and async requests, and making it significantly easier to customize and use for your own purposes. **To run Llama 405B, we recommend using VLLM's OpenAI-compliant API to host the model, and use the `local-completions` model type to evaluate the model.**
-- [2024/07] New Open LLM Leaderboard tasks have been added! You can find them under the [leaderboard](lm_eval/tasks/leaderboard/README.md) task group.
----
-## Announcement
-**A new v0.4.0 release of lm-evaluation-harness is available**!
-New updates and features include:
-- **New Open LLM Leaderboard tasks have been added! You can find them under the [leaderboard](lm_eval/tasks/leaderboard/README.md) task group.**
-- Internal refactoring
-- Config-based task creation and configuration
-- Easier import and sharing of externally-defined task config YAMLs
-- Support for Jinja2 prompt design, easy modification of prompts + prompt imports from Promptsource
-- More advanced configuration options, including output post-processing, answer extraction, and multiple LM generations per document, configurable fewshot settings, and more
-- Speedups and new modeling libraries supported, including: faster data-parallel HF model usage, vLLM support, MPS support with HuggingFace, and more
-- Logging and usability changes
-- New tasks including CoT BIG-Bench-Hard, Belebele, user-defined task groupings, and more
-Please see our updated documentation pages in `docs/` for more details.
-Development will be continuing on the `main` branch, and we encourage you to give us feedback on what features are desired and how to improve the library further, or ask questions, either in issues or PRs on GitHub, or in the [EleutherAI discord](https://discord.gg/eleutherai)!
----
-## Overview
-This project provides a unified framework to test generative language models on a large number of different evaluation tasks.
-**Features:**
-- Over 60 standard academic benchmarks for LLMs, with hundreds of subtasks and variants implemented.
-- Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
-- Support for fast and memory-efficient inference with [vLLM](https://github.com/vllm-project/vllm).
-- Support for commercial APIs including [OpenAI](https://openai.com) and [TextSynth](https://textsynth.com/).
-- Support for evaluation on adapters (e.g. LoRA) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft).
-- Support for local models and benchmarks.
-- Evaluation with publicly available prompts ensures reproducibility and comparability between papers.
-- Easy support for custom prompts and evaluation metrics.
-The Language Model Evaluation Harness is the backend for 🤗 Hugging Face's popular [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), has been used in [hundreds of papers](https://scholar.google.com/scholar?oi=bibs&hl=en&authuser=2&cites=15052937328817631261,4097184744846514103,1520777361382155671,17476825572045927382,18443729326628441434,14801318227356878622,7890865700763267262,12854182577605049984,15641002901115500560,5104500764547628290), and is used internally by dozens of organizations including NVIDIA, Cohere, BigScience, BigCode, Nous Research, and Mosaic ML.
-## Install
-To install the `lm-eval` package from the github repository, run:
-```bash
-git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness
-cd lm-evaluation-harness
-pip install -e .
-```
-We also provide a number of optional dependencies for extended functionality. A detailed table is available at the end of this document.
-## Basic Usage
-### User Guide
-A user guide detailing the full list of supported arguments is provided [here](./docs/interface.md), and on the terminal by calling `lm_eval -h`. Alternatively, you can use `lm-eval` instead of `lm_eval`.
-A list of supported tasks (or groupings of tasks) can be viewed with `lm-eval --tasks list`. Task descriptions and links to corresponding subfolders are provided [here](./lm_eval/tasks/README.md).
-### Hugging Face `transformers`
-To evaluate a model hosted on the [HuggingFace Hub](https://huggingface.co/models) (e.g. GPT-J-6B) on `hellaswag` you can use the following command (this assumes you are using a CUDA-compatible GPU):
-```bash
-lm_eval --model hf \
-    --model_args pretrained=EleutherAI/gpt-j-6B \
-    --tasks hellaswag \
-    --device cuda:0 \
-    --batch_size 8
-```
-Additional arguments can be provided to the model constructor using the `--model_args` flag. Most notably, this supports the common practice of using the `revisions` feature on the Hub to store partially trained checkpoints, or to specify the datatype for running a model:
-```bash
-lm_eval --model hf \
-    --model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \
-    --tasks lambada_openai,hellaswag \
-    --device cuda:0 \
-    --batch_size 8
-```
-Models that are loaded via both `transformers.AutoModelForCausalLM` (autoregressive, decoder-only GPT style models) and `transformers.AutoModelForSeq2SeqLM` (such as encoder-decoder models like T5) in Huggingface are supported.
-Batch size selection can be automated by setting the ```--batch_size``` flag to ```auto```. This will perform automatic detection of the largest batch size that will fit on your device. On tasks where there is a large difference between the longest and shortest example, it can be helpful to periodically recompute the largest batch size, to gain a further speedup. To do this, append ```:N``` to the above flag to automatically recompute the largest batch size ```N``` times. For example, to recompute the batch size 4 times, the command would be:
-```bash
-lm_eval --model hf \
-    --model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \
-    --tasks lambada_openai,hellaswag \
-    --device cuda:0 \
-    --batch_size auto:4
-```
-> [!Note]
-> Just like you can provide a local path to `transformers.AutoModel`, you can also provide a local path to `lm_eval` via `--model_args pretrained=/path/to/model`
-#### Multi-GPU Evaluation with Hugging Face `accelerate`
-We support three main ways of using Hugging Face's [accelerate 🚀](https://github.com/huggingface/accelerate) library for multi-GPU evaluation.
-To perform *data-parallel evaluation* (where each GPU loads a **separate full copy** of the model), we leverage the `accelerate` launcher as follows:
-```bash
-accelerate launch -m lm_eval --model hf \
-    --tasks lambada_openai,arc_easy \
-    --batch_size 16
-```
-(or via `accelerate launch --no-python lm_eval`).
-For cases where your model can fit on a single GPU, this allows you to evaluate on K GPUs K times faster than on one.
-**WARNING**: This setup does not work with FSDP model sharding, so in `accelerate config` FSDP must be disabled, or the NO_SHARD FSDP option must be used.
-The second way of using `accelerate` for multi-GPU evaluation is when your model is *too large to fit on a single GPU.*
-In this setting, run the library *outside the `accelerate` launcher*, but passing `parallelize=True` to `--model_args` as follows:
-```bash
-lm_eval --model hf \
-    --tasks lambada_openai,arc_easy \
-    --model_args parallelize=True \
-    --batch_size 16
-```
-This means that your model's weights will be split across all available GPUs.
-For more advanced users or even larger models, we allow for the following arguments when `parallelize=True` as well:
-- `device_map_option`: How to split model weights across available GPUs. defaults to "auto".
-- `max_memory_per_gpu`: the max GPU memory to use per GPU in loading the model.
-- `max_cpu_memory`: the max amount of CPU memory to use when offloading the model weights to RAM.
-- `offload_folder`: a folder where model weights will be offloaded to disk if needed.
-The third option is to use both at the same time. This will allow you to take advantage of both data parallelism and model sharding, and is especially useful for models that are too large to fit on a single GPU.
-```bash
-accelerate launch --multi_gpu --num_processes {nb_of_copies_of_your_model} \
-    -m lm_eval --model hf \
-    --tasks lambada_openai,arc_easy \
-    --model_args parallelize=True \
-    --batch_size 16
-```
-To learn more about model parallelism and how to use it with the `accelerate` library, see the [accelerate documentation](https://huggingface.co/docs/transformers/v4.15.0/en/parallelism)
-**Warning: We do not natively support multi-node evaluation using the `hf` model type! Please reference [our GPT-NeoX library integration](https://github.com/EleutherAI/gpt-neox/blob/main/eval.py) for an example of code in which a custom multi-machine evaluation script is written.**
-**Note: we do not currently support multi-node evaluations natively, and advise using either an externally hosted server to run inference requests against, or creating a custom integration with your distributed framework [as is done for the GPT-NeoX library](https://github.com/EleutherAI/gpt-neox/blob/main/eval_tasks/eval_adapter.py).**
-### Steered Hugging Face `transformers` models
-To evaluate a Hugging Face `transformers` model with steering vectors applied, specify the model type as `steered` and provide the path to either a PyTorch file containing pre-defined steering vectors, or a CSV file that specifies how to derive steering vectors from pretrained `sparsify` or `sae_lens` models (you will need to install the corresponding optional dependency for this method).
-Specify pre-defined steering vectors:
-```python
-import torch
-steer_config = {
-    "layers.3": {
-        "steering_vector": torch.randn(1, 768),
-        "bias": torch.randn(1, 768),
-        "steering_coefficient": 1,
-        "action": "add"
-    },
-}
-torch.save(steer_config, "steer_config.pt")
-```
-Specify derived steering vectors:
-```python
-import pandas as pd
-pd.DataFrame({
-    "loader": ["sparsify"],
-    "action": ["add"],
-    "sparse_model": ["EleutherAI/sae-pythia-70m-32k"],
-    "hookpoint": ["layers.3"],
-    "feature_index": [30],
-    "steering_coefficient": [10.0],
-}).to_csv("steer_config.csv", index=False)
-```
-Run the evaluation harness with steering vectors applied:
-```bash
-lm_eval --model steered \
-    --model_args pretrained=EleutherAI/pythia-160m,steer_path=steer_config.pt \
-    --tasks lambada_openai,hellaswag \
-    --device cuda:0 \
-    --batch_size 8
-```
-### NVIDIA `nemo` models
-[NVIDIA NeMo Framework](https://github.com/NVIDIA/NeMo) is a generative AI framework built for researchers and pytorch developers working on language models.
-To evaluate a `nemo` model, start by installing NeMo following [the documentation](https://github.com/NVIDIA/NeMo?tab=readme-ov-file#installation). We highly recommend using the NVIDIA PyTorch or NeMo container, especially if you have issues installing Apex or any other dependencies (see [latest released containers](https://github.com/NVIDIA/NeMo/releases)). Please also install the lm evaluation harness library following the instructions in [the Install section](https://github.com/EleutherAI/lm-evaluation-harness/tree/main?tab=readme-ov-file#install).
-NeMo models can be obtained through [NVIDIA NGC Catalog](https://catalog.ngc.nvidia.com/models) or in [NVIDIA's Hugging Face page](https://huggingface.co/nvidia). In [NVIDIA NeMo Framework](https://github.com/NVIDIA/NeMo/tree/main/scripts/nlp_language_modeling) there are conversion scripts to convert the `hf` checkpoints of popular models like llama, falcon, mixtral or mpt to `nemo`.
-Run a `nemo` model on one GPU:
-```bash
-lm_eval --model nemo_lm \
-    --model_args path=<path_to_nemo_model> \
-    --tasks hellaswag \
-    --batch_size 32
-```
-It is recommended to unpack the `nemo` model to avoid the unpacking inside the docker container - it may overflow disk space. For that you can run:
-```bash
-mkdir MY_MODEL
-tar -xvf MY_MODEL.nemo -C MY_MODEL
-```
-#### Multi-GPU evaluation with NVIDIA `nemo` models
-By default, only one GPU is used. But we do support either data replication or tensor/pipeline parallelism during evaluation, on one node.
-1) To enable data replication, set the `model_args` of `devices` to the number of data replicas to run. For example, the command to run 8 data replicas over 8 GPUs is:
-```bash
-torchrun --nproc-per-node=8 --no-python lm_eval \
-    --model nemo_lm \
-    --model_args path=<path_to_nemo_model>,devices=8 \
-    --tasks hellaswag \
-    --batch_size 32
-```
-2) To enable tensor and/or pipeline parallelism, set the `model_args` of `tensor_model_parallel_size` and/or `pipeline_model_parallel_size`. In addition, you also have to set `devices` equal to the product of `tensor_model_parallel_size` and/or `pipeline_model_parallel_size`. For example, the command to use one node of 4 GPUs with tensor parallelism of 2 and pipeline parallelism of 2 is:
-```bash
-torchrun --nproc-per-node=4 --no-python lm_eval \
-    --model nemo_lm \
-    --model_args path=<path_to_nemo_model>,devices=4,tensor_model_parallel_size=2,pipeline_model_parallel_size=2 \
-    --tasks hellaswag \
-    --batch_size 32
-```
-Note that it is recommended to replace the `python` command with `torchrun --nproc-per-node=<number of devices> --no-python` to facilitate loading the model into the GPUs. This is especially important for large checkpoints loaded into multiple GPUs.
-Not supported yet: multi-node evaluation and combinations of data replication with tensor or pipeline parallelism.
-#### Multi-GPU evaluation with OpenVINO models
-Pipeline parallelism during evaluation is supported with OpenVINO models.
-To enable pipeline parallelism, set the `model_args` of `pipeline_parallel`. In addition, you also have to set `device` to the value `HETERO:<GPU index1>,<GPU index2>`, for example `HETERO:GPU.1,GPU.0`. The command to use pipeline parallelism of 2 is:
-```bash
-lm_eval --model openvino \
-    --tasks wikitext \
-    --model_args pretrained=<path_to_ov_model>,pipeline_parallel=True \
-    --device HETERO:GPU.1,GPU.0
-```
-### Tensor + Data Parallel and Optimized Inference with `vLLM`
-We also support vLLM for faster inference on [supported model types](https://docs.vllm.ai/en/latest/models/supported_models.html), especially faster when splitting a model across multiple GPUs. For single-GPU or multi-GPU — tensor parallel, data parallel, or a combination of both — inference, for example:
-```bash
-lm_eval --model vllm \
-    --model_args pretrained={model_name},tensor_parallel_size={GPUs_per_model},dtype=auto,gpu_memory_utilization=0.8,data_parallel_size={model_replicas} \
-    --tasks lambada_openai \
-    --batch_size auto
-```
-To use vllm, do `pip install lm_eval[vllm]`. For a full list of supported vLLM configurations, please reference our [vLLM integration](https://github.com/EleutherAI/lm-evaluation-harness/blob/e74ec966556253fbe3d8ecba9de675c77c075bce/lm_eval/models/vllm_causallms.py) and the vLLM documentation.
-vLLM occasionally differs in output from Huggingface. We treat Huggingface as the reference implementation, and provide a [script](./scripts/model_comparator.py) for checking the validity of vllm results against HF.
-> [!Tip]
-> For fastest performance, we recommend using `--batch_size auto` for vLLM whenever possible, to leverage its continuous batching functionality!
-> [!Tip]
-> Passing `max_model_len=4096` or some other reasonable default to vLLM through model args may cause speedups or prevent out-of-memory errors when trying to use auto batch size, such as for Mistral-7B-v0.1 which defaults to a maximum length of 32k.
-### Tensor + Data Parallel and Fast Offline Batching Inference with `SGLang`
-We support SGLang for efficient offline batch inference. Its **[Fast Backend Runtime](https://docs.sglang.ai/index.html)** delivers high performance through optimized memory management and parallel processing techniques. Key features include tensor parallelism, continuous batching, and support for various quantization methods (FP8/INT4/AWQ/GPTQ).
-To use SGLang as the evaluation backend, please **install it in advance** by following the SGLang documentation [here](https://docs.sglang.ai/start/install.html#install-sglang).
-> [!Tip]
-> Due to the installation method of [`Flashinfer`](https://docs.flashinfer.ai/) — a fast attention kernel library — we don't include the dependencies of `SGLang` within [pyproject.toml](pyproject.toml). Note that `Flashinfer` also has some requirements on the `torch` version.
-SGLang's server arguments are slightly different from other backends, see [here](https://docs.sglang.ai/backend/server_arguments.html) for more information. We provide an example of the usage here:
-```bash
-lm_eval --model sglang \
-    --model_args pretrained={model_name},dp_size={data_parallel_size},tp_size={tensor_parallel_size},dtype=auto \
-    --tasks gsm8k_cot \
-    --batch_size auto
-```
-> [!Tip]
-> When encountering out of memory (OOM) errors (especially for multiple-choice tasks), try these solutions:
->
-> 1. Use a manual `batch_size`, rather than `auto`.
-> 2. Lower KV cache pool memory usage by adjusting `mem_fraction_static` - Add to your model arguments for example `--model_args pretrained=...,mem_fraction_static=0.7`.
-> 3. Increase tensor parallel size `tp_size` (if using multiple GPUs).
-### Model APIs and Inference Servers
-Our library also supports the evaluation of models served via several commercial APIs, and we hope to implement support for the most commonly used performant local/self-hosted inference servers.
-To call a hosted model, use:
-```bash
-export OPENAI_API_KEY=YOUR_KEY_HERE
-lm_eval --model openai-completions \
-    --model_args model=davinci-002 \
-    --tasks lambada_openai,hellaswag
-```
-We also support using your own local inference server with servers that mirror the OpenAI Completions and ChatCompletions APIs.
-```bash
-lm_eval --model local-completions --tasks gsm8k --model_args model=facebook/opt-125m,base_url=http://{yourip}:8000/v1/completions,num_concurrent=1,max_retries=3,tokenized_requests=False,batch_size=16
-```
-Note that for externally hosted models, configs such as `--device` which relate to where to place a local model should not be used and do not function. Just like you can use `--model_args` to pass arbitrary arguments to the model constructor for local models, you can use it to pass arbitrary arguments to the model API for hosted models. See the documentation of the hosting service for information on what arguments they support.
-| API or Inference Server                                                                                                   | Implemented?                                                                                            | `--model <xxx>` name                                | Models supported:                                                                                                                                                                                                                                                                                                                                          | Request Types:                                                                 |
-| --------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------|
-| OpenAI Completions                                                                                                        | :heavy_check_mark:                                                                                      | `openai-completions`, `local-completions`           | All OpenAI Completions API models                                                                                                                                                                                                                                                                                                                          | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                     |
-| OpenAI ChatCompletions                                                                                                    | :heavy_check_mark:                                                                                      | `openai-chat-completions`, `local-chat-completions` | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt)                                                                                                                                                                                                                                                                              | `generate_until` (no logprobs)                                                 |
-| Anthropic                                                                                                                 | :heavy_check_mark:                                                                                      | `anthropic`                                         | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model)                                                                                                                                                                                                                                                               | `generate_until` (no logprobs)                                                 |
-| Anthropic Chat                                                                                                            | :heavy_check_mark:                                                                                      | `anthropic-chat`, `anthropic-chat-completions`      | [Supported Anthropic Engines](https://docs.anthropic.com/claude/docs/models-overview)                                                                                                                                                                                                                                                                      | `generate_until` (no logprobs)                                                 |
-| Textsynth                                                                                                                 | :heavy_check_mark:                                                                                      | `textsynth`                                         | [All supported engines](https://textsynth.com/documentation.html#engines)                                                                                                                                                                                                                                                                                  | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                     |
-| Cohere                                                                                                                    | [:hourglass: - blocked on Cohere API bug](https://github.com/EleutherAI/lm-evaluation-harness/pull/395) | N/A                                                 | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models)                                                                                                                                                                                                                                                                                     | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                     |
-| [Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)) | :heavy_check_mark:                                                                                      | `gguf`, `ggml`                                      | [All models supported by llama.cpp](https://github.com/ggerganov/llama.cpp)                                                                                                                                                                                                                                                                                | `generate_until`, `loglikelihood`, (perplexity evaluation not yet implemented) |
-| vLLM                                                                                                                      | :heavy_check_mark:                                                                                      | `vllm`                                              | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html)                                                                                                                                                                                                                                                              | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                     |
-| Mamba                                                                                                                     | :heavy_check_mark:                                                                                      | `mamba_ssm`                                         | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces)                                                                                                                                                                                                                                                      | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                     |
-| Huggingface Optimum (Causal LMs)                                                                                          | :heavy_check_mark:                                                                                      | `openvino`                                          | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format                                                                                                                                                                                                                            | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                     |
-| Huggingface Optimum-intel IPEX (Causal LMs)                                                                               | :heavy_check_mark:                                                                                      | `ipex`                                              | Any decoder-only AutoModelForCausalLM                                                                                                                                                                                                                                                                                                                      | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                     |
-| Neuron via AWS Inf2 (Causal LMs)                                                                                          | :heavy_check_mark:                                                                                      | `neuronx`                                           | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2)                                                                                                                                                                                            | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                     |
-| [Neural Magic DeepSparse](https://github.com/neuralmagic/deepsparse)                                                      | :heavy_check_mark:                                                                                      | `deepsparse`                                        | Any LM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub with the "deepsparse" tag](https://huggingface.co/models?other=deepsparse)                                                                                                                                                                                                       | `generate_until`, `loglikelihood`                                              |
-| [Neural Magic SparseML](https://github.com/neuralmagic/sparseml)                                                          | :heavy_check_mark:                                                                                      | `sparseml`                                          | Any decoder-only AutoModelForCausalLM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub](https://huggingface.co/neuralmagic). Especially useful for models with quantization like [`zoo:llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized`](https://sparsezoo.neuralmagic.com/models/llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized) | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                     |
-| NVIDIA NeMo                                                                                                               | :heavy_check_mark:                                                                                      | `nemo_lm`                                           | [All supported models](https://docs.nvidia.com/nemo-framework/user-guide/24.09/nemotoolkit/core/core.html#nemo-models)                                                                                                                                                                                                                                     | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                     |
-| Watsonx.ai                                                                                                                | :heavy_check_mark:                                                                                      | `watsonx_llm`                                       | [Supported Watsonx.ai Engines](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx)                                                                                                                                                                                                                                 | `generate_until` `loglikelihood`                                               |
-| [Your local inference server!](docs/API_guide.md)                                                                         | :heavy_check_mark:                                                                                      | `local-completions` or `local-chat-completions`     | Support for OpenAI API-compatible servers, with easy customization for other APIs.                                                                                                                                                                                                                                                                         | `generate_until`, `loglikelihood`, `loglikelihood_rolling`                     |
-Models which do not supply logits or logprobs can be used with tasks of type `generate_until` only, while local models, or APIs that supply logprobs/logits of their prompts, can be run on all task types: `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.
-For more information on the different task `output_types` and model request types, see [our documentation](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/model_guide.md#interface).
-> [!Note]
-> For best performance with closed chat model APIs such as Anthropic Claude 3 and GPT-4, we recommend carefully looking at a few sample outputs using `--limit 10` first to confirm answer extraction and scoring on generative tasks is performing as expected. Providing `system="<some system prompt here>"` within `--model_args` for anthropic-chat-completions, to instruct the model what format to respond in, may be useful.
-### Other Frameworks
-A number of other libraries contain scripts for calling the eval harness through their library. These include [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/blob/main/eval_tasks/eval_adapter.py), [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md), and [mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/blob/master/eval_harness.py).
-To create your own custom integration you can follow instructions from [this tutorial](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage).
-### Additional Features
-> [!Note]
-> For tasks unsuitable for direct evaluation — either due to risks associated with executing untrusted code or complexities in the evaluation process — the `--predict_only` flag is available to obtain decoded generations for post-hoc evaluation.
-If you have a Metal compatible Mac, you can run the eval harness using the MPS back-end by replacing `--device cuda:0` with `--device mps` (requires PyTorch version 2.1 or higher). **Note that the PyTorch MPS backend is still in early stages of development, so correctness issues or unsupported operations may exist. If you observe oddities in model performance on the MPS back-end, we recommend first checking that a forward pass of your model on `--device cpu` and `--device mps` match.**
-> [!Note]
-> You can inspect what the LM inputs look like by running the following command:
->
-> ```bash
-> python write_out.py \
->     --tasks <task1,task2,...> \
->     --num_fewshot 5 \
->     --num_examples 10 \
->     --output_base_path /path/to/output/folder
-> ```
->
-> This will write out one text file for each task.
-To verify the data integrity of the tasks you're performing in addition to running the tasks themselves, you can use the `--check_integrity` flag:
-```bash
-lm_eval --model openai-completions \
-    --model_args model=davinci-002 \
-    --tasks lambada_openai,hellaswag \
-    --check_integrity
-```
-## Advanced Usage Tips
-For models loaded with the HuggingFace `transformers` library, any arguments provided via `--model_args` get passed to the relevant constructor directly. This means that anything you can do with `AutoModel` can be done with our library. For example, you can pass a local path via `pretrained=` or use models finetuned with [PEFT](https://github.com/huggingface/peft) by taking the call you would run to evaluate the base model and adding `,peft=PATH` to the `model_args` argument:
-```bash
-lm_eval --model hf \
-    --model_args pretrained=EleutherAI/gpt-j-6b,parallelize=True,load_in_4bit=True,peft=nomic-ai/gpt4all-j-lora \
-    --tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \
-    --device cuda:0
-```
-Models provided as delta weights can be easily loaded using the Hugging Face transformers library. Within `--model_args`, set the `delta` argument to specify the delta weights, and use the `pretrained` argument to designate the relative base model to which they will be applied:
-```bash
-lm_eval --model hf \
-    --model_args pretrained=Ejafa/llama_7B,delta=lmsys/vicuna-7b-delta-v1.1 \
-    --tasks hellaswag
-```
-GPTQ quantized models can be loaded using [GPTQModel](https://github.com/ModelCloud/GPTQModel) (faster) or [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ).
-GPTQModel: add `,gptqmodel=True` to `model_args`:
-```bash
-lm_eval --model hf \
-    --model_args pretrained=model-name-or-path,gptqmodel=True \
-    --tasks hellaswag
-```
-AutoGPTQ: add `,autogptq=True` to `model_args`:
-```bash
-lm_eval --model hf \
-    --model_args pretrained=model-name-or-path,autogptq=model.safetensors,gptq_use_triton=True \
-    --tasks hellaswag
-```
-We support wildcards in task names; for example, you can run all of the machine-translated lambada tasks via `--tasks lambada_openai_mt_*`.
-## Saving & Caching Results
-To save evaluation results provide an `--output_path`. We also support logging model responses with the `--log_samples` flag for post-hoc analysis.
-> [!TIP]
-> Use `--use_cache <DIR>` to cache evaluation results and skip previously evaluated samples when resuming runs of the same (model, task) pairs. Note that caching is rank-dependent, so restart with the same GPU count if interrupted. You can also use `--cache_requests` to save dataset preprocessing steps for faster evaluation resumption.
-To push results and samples to the Hugging Face Hub, first ensure an access token with write access is set in the `HF_TOKEN` environment variable. Then, use the `--hf_hub_log_args` flag to specify the organization, repository name, repository visibility, and whether to push results and samples to the Hub - [example dataset on the  HF Hub](https://huggingface.co/datasets/KonradSzafer/lm-eval-results-demo). For instance:
-```bash
-lm_eval --model hf \
-    --model_args pretrained=model-name-or-path,autogptq=model.safetensors,gptq_use_triton=True \
-    --tasks hellaswag \
-    --log_samples \
-    --output_path results \
-    --hf_hub_log_args hub_results_org=EleutherAI,hub_repo_name=lm-eval-results,push_results_to_hub=True,push_samples_to_hub=True,public_repo=False
-```
-This allows you to easily download the results and samples from the Hub, using:
-```python
-from datasets import load_dataset
-load_dataset("EleutherAI/lm-eval-results-private", "hellaswag", "latest")
-```
-For a full list of supported arguments, check out the [interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md) guide in our documentation!
-## Visualizing Results
-You can seamlessly visualize and analyze the results of your evaluation harness runs using both Weights & Biases (W&B) and Zeno.
-### Zeno
-You can use [Zeno](https://zenoml.com) to visualize the results of your eval harness runs.
-First, head to [hub.zenoml.com](https://hub.zenoml.com) to create an account and get an API key [on your account page](https://hub.zenoml.com/account).
-Add this key as an environment variable:
-```bash
-export ZENO_API_KEY=[your api key]
-```
-You'll also need to install the `lm_eval[zeno]` package extra.
-To visualize the results, run the eval harness with the `log_samples` and `output_path` flags.
-We expect `output_path` to contain multiple folders that represent individual model names.
-You can thus run your evaluation on any number of tasks and models and upload all of the results as projects on Zeno.
-```bash
-lm_eval \
-    --model hf \
-    --model_args pretrained=EleutherAI/gpt-j-6B \
-    --tasks hellaswag \
-    --device cuda:0 \
-    --batch_size 8 \
-    --log_samples \
-    --output_path output/gpt-j-6B
-```
-Then, you can upload the resulting data using the `zeno_visualize` script:
-```bash
-python scripts/zeno_visualize.py \
-    --data_path output \
-    --project_name "Eleuther Project"
-```
-This will use all subfolders in `data_path` as different models and upload all tasks within these model folders to Zeno.
-If you run the eval harness on multiple tasks, the `project_name` will be used as a prefix and one project will be created per task.
-You can find an example of this workflow in [examples/visualize-zeno.ipynb](examples/visualize-zeno.ipynb).
-### Weights and Biases
-With the [Weights and Biases](https://wandb.ai/site) integration, you can now spend more time extracting deeper insights into your evaluation results. The integration is designed to streamline the process of logging and visualizing experiment results using the Weights & Biases (W&B) platform.
-The integration provides functionality
-- to automatically log the evaluation results,
-- log the samples as W&B Tables for easy visualization,
-- log the `results.json` file as an artifact for version control,
-- log the `<task_name>_eval_samples.json` file if the samples are logged,
-- generate a comprehensive report for analysis and visualization with all the important metrics,
-- log task and cli specific configs,
-- and more out of the box like the command used to run the evaluation, GPU/CPU counts, timestamp, etc.
-First you'll need to install the `lm_eval[wandb]` package extra. Do `pip install lm_eval[wandb]`.
-Authenticate your machine with your unique W&B token. Visit https://wandb.ai/authorize to get one. Do `wandb login` in your command line terminal.
-Run eval harness as usual with a `wandb_args` flag. Use this flag to provide arguments for initializing a wandb run ([wandb.init](https://docs.wandb.ai/ref/python/init)) as comma separated string arguments.
-```bash
-lm_eval \
-    --model hf \
-    --model_args pretrained=microsoft/phi-2,trust_remote_code=True \
-    --tasks hellaswag,mmlu_abstract_algebra \
-    --device cuda:0 \
-    --batch_size 8 \
-    --output_path output/phi-2 \
-    --limit 10 \
-    --wandb_args project=lm-eval-harness-integration \
-    --log_samples
-```
-In the stdout, you will find the link to the W&B run page as well as a link to the generated report. You can find an example of this workflow in [examples/visualize-wandb.ipynb](examples/visualize-wandb.ipynb), and an example of how to integrate it beyond the CLI.
-## How to Contribute or Learn More?
-For more information on the library and how everything fits together, check out all of our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs)! We plan to post a larger roadmap of desired + planned library improvements soon, with more information on how contributors can help.
-### Implementing new tasks
-To implement a new task in the eval harness, see [this guide](./docs/new_task_guide.md).
-In general, we follow this priority list for addressing concerns about prompting and other eval details:
-1. If there is widespread agreement among people who train LLMs, use the agreed upon procedure.
-2. If there is a clear and unambiguous official implementation, use that procedure.
-3. If there is widespread agreement among people who evaluate LLMs, use the agreed upon procedure.
-4. If there are multiple common implementations but not universal or widespread agreement, use our preferred option among the common implementations. As before, prioritize choosing from among the implementations found in LLM training papers.
-These are guidelines and not rules, and can be overruled in special circumstances.
-We try to prioritize agreement with the procedures used by other groups to decrease the harm when people inevitably compare runs across different papers despite our discouragement of the practice. Historically, we also prioritized the implementation from [Language Models are Few Shot Learners](https://arxiv.org/abs/2005.14165) as our original goal was specifically to compare results with that paper.
-### Support
-The best way to get support is to open an issue on this repo or join the [EleutherAI Discord server](https://discord.gg/eleutherai). The `#lm-thunderdome` channel is dedicated to developing this project and the `#release-discussion` channel is for receiving support for our releases. If you've used the library and have had a positive (or negative) experience, we'd love to hear from you!
-## Optional Extras
-Extra dependencies can be installed via `pip install -e ".[NAME]"`
-| Name                 | Use                                                   |
-| -------------------- | ----------------------------------------------------- |
-| api                  | For using api models (Anthropic, OpenAI API)          |
-| audiolm_qwen         | For running Qwen2 audio models                        |
-| deepsparse           | For running NM's DeepSparse models                    |
-| dev                  | For linting PRs and contributions                     |
-| gptq                 | For loading models with AutoGPTQ                      |
-| gptqmodel            | For loading models with GPTQModel                     |
-| hf_transfer          | For speeding up HF Hub file downloads                 |
-| ibm_watsonx_ai       | For using IBM watsonx.ai model apis                   |
-| ifeval               | For running the IFEval task                           |
-| ipex                 | For running on optimum-intel ipex backend             |
-| japanese_leaderboard | For running Japanese LLM Leaderboard tasks            |
-| longbench            | For running LongBench tasks                           |
-| mamba                | For loading Mamba SSM models                          |
-| math                 | For running math task answer checking                 |
-| multilingual         | For multilingual tokenizers                           |
-| neuronx              | For running on AWS inf2 instances                     |
-| optimum              | For running Intel OpenVINO models                     |
-| promptsource         | For using PromptSource prompts                        |
-| ruler                | For running RULER tasks                               |
-| sae_lens             | For using SAELens to steer models                     |
-| sentencepiece        | For using the sentencepiece tokenizer                 |
-| sparseml             | For using NM's SparseML models                        |
-| sparsify             | For using Sparsify to steer models                    |
-| testing              | For running library test suite                        |
-| vllm                 | For loading models with vLLM                          |
-| wandb                | For integration with `Weights and Biases` platform    |
-| zeno                 | For visualizing results with Zeno                     |
-| -------------------- | ----------------------------------------------------- |
-| all                  | Loads all extras (not recommended)                    |
-## Cite as
-```text
-@misc{eval-harness,
-  author       = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
-  title        = {The Language Model Evaluation Harness},
-  month        = 07,
-  year         = 2024,
-  publisher    = {Zenodo},
-  version      = {v0.4.3},
-  doi          = {10.5281/zenodo.12608602},
-  url          = {https://zenodo.org/records/12608602}
-}
-```

lm-evaluation-harness/ignore.txt DELETED Viewed

@@ -1,8 +0,0 @@
-ROUGE
-rouge
-nin
-maka
-mor
-te
-ond
-extraversion

lm-evaluation-harness/lm_eval/__init__.py DELETED Viewed

@@ -1,7 +0,0 @@
-import logging
-import os
-from .evaluator import evaluate, simple_evaluate
-__version__ = "0.4.8"

lm-evaluation-harness/lm_eval/__main__.py DELETED Viewed

@@ -1,530 +0,0 @@
-import argparse
-import json
-import logging
-import os
-import sys
-from functools import partial
-from pathlib import Path
-from typing import Union
-from lm_eval import evaluator, utils
-from lm_eval.evaluator import request_caching_arg_to_dict
-from lm_eval.loggers import EvaluationTracker, WandbLogger
-from lm_eval.tasks import TaskManager
-from lm_eval.utils import (
-    handle_non_serializable,
-    make_table,
-    simple_parse_args_string,
-)
-def try_parse_json(value: str) -> Union[str, dict, None]:
-    if value is None:
-        return None
-    try:
-        return json.loads(value)
-    except json.JSONDecodeError:
-        if "{" in value:
-            raise argparse.ArgumentTypeError(
-                f"Invalid JSON: {value}. Hint: Use double quotes for JSON strings."
-            )
-        return value
-def _int_or_none_list_arg_type(
-    min_len: int, max_len: int, defaults: str, value: str, split_char: str = ","
-):
-    def parse_value(item):
-        item = item.strip().lower()
-        if item == "none":
-            return None
-        try:
-            return int(item)
-        except ValueError:
-            raise argparse.ArgumentTypeError(f"{item} is not an integer or None")
-    items = [parse_value(v) for v in value.split(split_char)]
-    num_items = len(items)
-    if num_items == 1:
-        # Makes downstream handling the same for single and multiple values
-        items = items * max_len
-    elif num_items < min_len or num_items > max_len:
-        raise argparse.ArgumentTypeError(
-            f"Argument requires {max_len} integers or None, separated by '{split_char}'"
-        )
-    elif num_items != max_len:
-        logging.warning(
-            f"Argument requires {max_len} integers or None, separated by '{split_char}'. "
-            "Missing values will be filled with defaults."
-        )
-        default_items = [parse_value(v) for v in defaults.split(split_char)]
-        items.extend(
-            default_items[num_items:]
-        )  # extend items list with missing defaults
-    return items
-def check_argument_types(parser: argparse.ArgumentParser):
-    """
-    Check to make sure all CLI args are typed, raises error if not
-    """
-    for action in parser._actions:
-        if action.dest != "help" and not action.const:
-            if action.type is None:
-                raise ValueError(
-                    f"Argument '{action.dest}' doesn't have a type specified."
-                )
-            else:
-                continue
-def setup_parser() -> argparse.ArgumentParser:
-    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
-    parser.add_argument(
-        "--model", "-m", type=str, default="hf", help="Name of model e.g. `hf`"
-    )
-    parser.add_argument(
-        "--tasks",
-        "-t",
-        default=None,
-        type=str,
-        metavar="task1,task2",
-        help="Comma-separated list of task names or task groupings to evaluate on.\nTo get full list of tasks, use one of the commands `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above",
-    )
-    parser.add_argument(
-        "--model_args",
-        "-a",
-        default="",
-        type=try_parse_json,
-        help="""Comma separated string or JSON formatted arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32` or '{"pretrained":"EleutherAI/pythia-160m","dtype":"float32"}'""",
-    )
-    parser.add_argument(
-        "--num_fewshot",
-        "-f",
-        type=int,
-        default=None,
-        metavar="N",
-        help="Number of examples in few-shot context",
-    )
-    parser.add_argument(
-        "--batch_size",
-        "-b",
-        type=str,
-        default=1,
-        metavar="auto|auto:N|N",
-        help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.",
-    )
-    parser.add_argument(
-        "--max_batch_size",
-        type=int,
-        default=None,
-        metavar="N",
-        help="Maximal batch size to try with --batch_size auto.",
-    )
-    parser.add_argument(
-        "--device",
-        type=str,
-        default=None,
-        help="Device to use (e.g. cuda, cuda:0, cpu).",
-    )
-    parser.add_argument(
-        "--output_path",
-        "-o",
-        default=None,
-        type=str,
-        metavar="DIR|DIR/file.json",
-        help="Path where result metrics will be saved. Can be either a directory or a .json file. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
-    )
-    parser.add_argument(
-        "--limit",
-        "-L",
-        type=float,
-        default=None,
-        metavar="N|0<N<1",
-        help="Limit the number of examples per task. "
-        "If <1, limit is a percentage of the total number of examples.",
-    )
-    parser.add_argument(
-        "--samples",
-        "-E",
-        default=None,
-        type=str,
-        metavar="/path/to/json",
-        help='JSON string or path to JSON file containing doc indices of selected examples to test. Format: {"task_name":[indices],...}',
-    )
-    parser.add_argument(
-        "--use_cache",
-        "-c",
-        type=str,
-        default=None,
-        metavar="DIR",
-        help="A path to a sqlite db file for caching model responses. `None` if not caching.",
-    )
-    parser.add_argument(
-        "--cache_requests",
-        type=str,
-        default=None,
-        choices=["true", "refresh", "delete"],
-        help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.",
-    )
-    parser.add_argument(
-        "--check_integrity",
-        action="store_true",
-        help="Whether to run the relevant part of the test suite for the tasks.",
-    )
-    parser.add_argument(
-        "--write_out",
-        "-w",
-        action="store_true",
-        default=False,
-        help="Prints the prompt for the first few documents.",
-    )
-    parser.add_argument(
-        "--log_samples",
-        "-s",
-        action="store_true",
-        default=False,
-        help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
-    )
-    parser.add_argument(
-        "--system_instruction",
-        type=str,
-        default=None,
-        help="System instruction to be used in the prompt",
-    )
-    parser.add_argument(
-        "--apply_chat_template",
-        type=str,
-        nargs="?",
-        const=True,
-        default=False,
-        help=(
-            "If True, apply chat template to the prompt. "
-            "Providing `--apply_chat_template` without an argument will apply the default chat template to the prompt. "
-            "To apply a specific template from the available list of templates, provide the template name as an argument. "
-            "E.g. `--apply_chat_template template_name`"
-        ),
-    )
-    parser.add_argument(
-        "--fewshot_as_multiturn",
-        action="store_true",
-        default=False,
-        help="If True, uses the fewshot as a multi-turn conversation",
-    )
-    parser.add_argument(
-        "--show_config",
-        action="store_true",
-        default=False,
-        help="If True, shows the the full config of all tasks at the end of the evaluation.",
-    )
-    parser.add_argument(
-        "--include_path",
-        type=str,
-        default=None,
-        metavar="DIR",
-        help="Additional path to include if there are external tasks to include.",
-    )
-    parser.add_argument(
-        "--gen_kwargs",
-        type=try_parse_json,
-        default=None,
-        help=(
-            "Either comma delimited string or JSON formatted arguments for model generation on greedy_until tasks,"
-            """ e.g. '{"temperature":0.7,"until":["hello"]}' or temperature=0,top_p=0.1."""
-        ),
-    )
-    parser.add_argument(
-        "--verbosity",
-        "-v",
-        type=str.upper,
-        default=None,
-        metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
-        help="(Deprecated) Controls logging verbosity level. Use the `LOGLEVEL` environment variable instead. Set to DEBUG for detailed output when testing or adding new task configurations.",
-    )
-    parser.add_argument(
-        "--wandb_args",
-        type=str,
-        default="",
-        help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval",
-    )
-    parser.add_argument(
-        "--wandb_config_args",
-        type=str,
-        default="",
-        help="Comma separated string arguments passed to wandb.config.update. Use this to trace parameters that aren't already traced by default. eg. `lr=0.01,repeats=3",
-    )
-    parser.add_argument(
-        "--hf_hub_log_args",
-        type=str,
-        default="",
-        help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`",
-    )
-    parser.add_argument(
-        "--predict_only",
-        "-x",
-        action="store_true",
-        default=False,
-        help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
-    )
-    default_seed_string = "0,1234,1234,1234"
-    parser.add_argument(
-        "--seed",
-        type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string),
-        default=default_seed_string,  # for backward compatibility
-        help=(
-            "Set seed for python's random, numpy, torch, and fewshot sampling.\n"
-            "Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, "
-            "respectively, or a single integer to set the same seed for all four.\n"
-            f"The values are either an integer or 'None' to not set the seed. Default is `{default_seed_string}` "
-            "(for backward compatibility).\n"
-            "E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and fewshot sampling seed to 52. "
-            "Here numpy's seed is not set since the second value is `None`.\n"
-            "E.g, `--seed 42` sets all four seeds to 42."
-        ),
-    )
-    parser.add_argument(
-        "--trust_remote_code",
-        action="store_true",
-        help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
-    )
-    parser.add_argument(
-        "--confirm_run_unsafe_code",
-        action="store_true",
-        help="Confirm that you understand the risks of running unsafe code for tasks that require it",
-    )
-    parser.add_argument(
-        "--metadata",
-        type=json.loads,
-        default=None,
-        help="""JSON string metadata to pass to task configs, for example '{"max_seq_lengths":[4096,8192]}'. Will be merged with model_args. Can also be set in task config.""",
-    )
-    return parser
-def parse_eval_args(parser: argparse.ArgumentParser) -> argparse.Namespace:
-    check_argument_types(parser)
-    return parser.parse_args()
-def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
-    if not args:
-        # we allow for args to be passed externally, else we parse them ourselves
-        parser = setup_parser()
-        args = parse_eval_args(parser)
-    if args.wandb_args:
-        wandb_args_dict = simple_parse_args_string(args.wandb_args)
-        wandb_config_args_dict = simple_parse_args_string(args.wandb_config_args)
-        wandb_logger = WandbLogger(wandb_args_dict, wandb_config_args_dict)
-    utils.setup_logging(args.verbosity)
-    eval_logger = logging.getLogger(__name__)
-    os.environ["TOKENIZERS_PARALLELISM"] = "false"
-    # update the evaluation tracker args with the output path and the HF token
-    if args.output_path:
-        args.hf_hub_log_args += f",output_path={args.output_path}"
-    if os.environ.get("HF_TOKEN", None):
-        args.hf_hub_log_args += f",token={os.environ.get('HF_TOKEN')}"
-    evaluation_tracker_args = simple_parse_args_string(args.hf_hub_log_args)
-    evaluation_tracker = EvaluationTracker(**evaluation_tracker_args)
-    if args.predict_only:
-        args.log_samples = True
-    if (args.log_samples or args.predict_only) and not args.output_path:
-        raise ValueError(
-            "Specify --output_path if providing --log_samples or --predict_only"
-        )
-    if args.fewshot_as_multiturn and args.apply_chat_template is False:
-        raise ValueError(
-            "When `fewshot_as_multiturn` is selected, `apply_chat_template` must be set (either to `True` or to the chosen template name)."
-        )
-    if args.include_path is not None:
-        eval_logger.info(f"Including path: {args.include_path}")
-    metadata = (
-        simple_parse_args_string(args.model_args)
-        if isinstance(args.model_args, str)
-        else args.model_args
-        if isinstance(args.model_args, dict)
-        else {}
-    ) | (
-        args.metadata
-        if isinstance(args.metadata, dict)
-        else simple_parse_args_string(args.metadata)
-    )
-    task_manager = TaskManager(include_path=args.include_path, metadata=metadata)
-    if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
-        eval_logger.warning(
-            "Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub."
-        )
-    if args.limit:
-        eval_logger.warning(
-            " --limit SHOULD ONLY BE USED FOR TESTING."
-            "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
-        )
-    if args.samples:
-        assert args.limit is None, (
-            "If --samples is not None, then --limit must be None."
-        )
-        if (samples := Path(args.samples)).is_file():
-            args.samples = json.loads(samples.read_text())
-        else:
-            args.samples = json.loads(args.samples)
-    if args.tasks is None:
-        eval_logger.error("Need to specify task to evaluate.")
-        sys.exit()
-    elif args.tasks == "list":
-        print(task_manager.list_all_tasks())
-        sys.exit()
-    elif args.tasks == "list_groups":
-        print(task_manager.list_all_tasks(list_subtasks=False, list_tags=False))
-        sys.exit()
-    elif args.tasks == "list_tags":
-        print(task_manager.list_all_tasks(list_groups=False, list_subtasks=False))
-        sys.exit()
-    elif args.tasks == "list_subtasks":
-        print(task_manager.list_all_tasks(list_groups=False, list_tags=False))
-        sys.exit()
-    else:
-        if os.path.isdir(args.tasks):
-            import glob
-            task_names = []
-            yaml_path = os.path.join(args.tasks, "*.yaml")
-            for yaml_file in glob.glob(yaml_path):
-                config = utils.load_yaml_config(yaml_file)
-                task_names.append(config)
-        else:
-            task_list = args.tasks.split(",")
-            task_names = task_manager.match_tasks(task_list)
-            for task in [task for task in task_list if task not in task_names]:
-                if os.path.isfile(task):
-                    config = utils.load_yaml_config(task)
-                    task_names.append(config)
-            task_missing = [
-                task for task in task_list if task not in task_names and "*" not in task
-            ]  # we don't want errors if a wildcard ("*") task name was used
-            if task_missing:
-                missing = ", ".join(task_missing)
-                eval_logger.error(
-                    f"Tasks were not found: {missing}\n"
-                    f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
-                )
-                raise ValueError(
-                    f"Tasks not found: {missing}. Try `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above, or pass '--verbosity DEBUG' to troubleshoot task registration issues."
-                )
-    # Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args
-    if args.trust_remote_code:
-        eval_logger.info(
-            "Passed `--trust_remote_code`, setting environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true`"
-        )
-        # HACK: import datasets and override its HF_DATASETS_TRUST_REMOTE_CODE value internally,
-        # because it's already been determined based on the prior env var before launching our
-        # script--`datasets` gets imported by lm_eval internally before these lines can update the env.
-        import datasets
-        datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
-        args.model_args = args.model_args + ",trust_remote_code=True"
-    (
-        eval_logger.info(f"Selected Tasks: {task_names}")
-        if eval_logger.getEffectiveLevel() >= logging.INFO
-        else print(f"Selected Tasks: {task_names}")
-    )
-    request_caching_args = request_caching_arg_to_dict(
-        cache_requests=args.cache_requests
-    )
-    results = evaluator.simple_evaluate(
-        model=args.model,
-        model_args=args.model_args,
-        tasks=task_names,
-        num_fewshot=args.num_fewshot,
-        batch_size=args.batch_size,
-        max_batch_size=args.max_batch_size,
-        device=args.device,
-        use_cache=args.use_cache,
-        limit=args.limit,
-        samples=args.samples,
-        check_integrity=args.check_integrity,
-        write_out=args.write_out,
-        log_samples=args.log_samples,
-        evaluation_tracker=evaluation_tracker,
-        system_instruction=args.system_instruction,
-        apply_chat_template=args.apply_chat_template,
-        fewshot_as_multiturn=args.fewshot_as_multiturn,
-        gen_kwargs=args.gen_kwargs,
-        task_manager=task_manager,
-        predict_only=args.predict_only,
-        random_seed=args.seed[0],
-        numpy_random_seed=args.seed[1],
-        torch_random_seed=args.seed[2],
-        fewshot_random_seed=args.seed[3],
-        confirm_run_unsafe_code=args.confirm_run_unsafe_code,
-        metadata=metadata,
-        **request_caching_args,
-    )
-    if results is not None:
-        if args.log_samples:
-            samples = results.pop("samples")
-        dumped = json.dumps(
-            results, indent=2, default=handle_non_serializable, ensure_ascii=False
-        )
-        if args.show_config:
-            print(dumped)
-        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
-        # Add W&B logging
-        if args.wandb_args:
-            try:
-                wandb_logger.post_init(results)
-                wandb_logger.log_eval_result()
-                if args.log_samples:
-                    wandb_logger.log_eval_samples(samples)
-            except Exception as e:
-                eval_logger.info(f"Logging to Weights and Biases failed due to {e}")
-        evaluation_tracker.save_results_aggregated(
-            results=results, samples=samples if args.log_samples else None
-        )
-        if args.log_samples:
-            for task_name, config in results["configs"].items():
-                evaluation_tracker.save_results_samples(
-                    task_name=task_name, samples=samples[task_name]
-                )
-        if (
-            evaluation_tracker.push_results_to_hub
-            or evaluation_tracker.push_samples_to_hub
-        ):
-            evaluation_tracker.recreate_metadata_card()
-        print(
-            f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
-            f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
-        )
-        print(make_table(results))
-        if "groups" in results:
-            print(make_table(results, "groups"))
-        if args.wandb_args:
-            # Tear down wandb run once all the logging is done.
-            wandb_logger.run.finish()
-if __name__ == "__main__":
-    cli_evaluate()

lm-evaluation-harness/lm_eval/api/filter.py DELETED Viewed

@@ -1,56 +0,0 @@
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from typing import Callable, Iterable, List, Union
-from lm_eval.api.instance import Instance
-class Filter(ABC):
-    """
-    Filter classes operate on a per-task level.
-    They take all model outputs (`instance.resps` for all `task.instances`)
-    across all instances of a task, and perform operations.
-    In a single run, one can configure any number of separate filters or lists of filters.
-    """
-    def __init__(self, **kwargs) -> None:
-        """
-        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
-        """
-    @abstractmethod
-    def apply(self, resps: Union[List, Iterable], docs: List[dict]) -> Iterable:
-        """
-        Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects.
-        Should return the list of (filtered) response lists *in the same order as they were input*, e.g.
-        if pass in [<inst.resps for instance 0>, <inst.resps for instance 1>] should return
-        [<filtered resps for instance 0>, <filtered resps for instance 1>]
-        """
-        return resps
-@dataclass
-class FilterEnsemble:
-    """
-    FilterEnsemble creates a pipeline applying multiple filters.
-    Its intended usage is to stack multiple post-processing steps in order.
-    `task.apply_filters` should use a list of FilterEnsemble classes that it stores, to apply each
-    pipeline separately.
-    """
-    name: str
-    filters: List[Callable[[], Filter]]
-    def apply(self, instances: List[Instance]) -> None:
-        resps, docs = zip(*((inst.resps, inst.doc) for inst in instances))
-        resps, docs = list(resps), list(docs)
-        for f in self.filters:
-            # apply filters in sequence
-            resps = f().apply(resps, docs)
-        # add the end results after filtering to filtered_requests of their respective source instances.
-        # has key `self.name`: each FilterEnsemble applied in a given run should use a different name.
-        for inst, resp in zip(instances, resps):
-            inst.filtered_resps[self.name] = resp

lm-evaluation-harness/lm_eval/api/group.py DELETED Viewed

@@ -1,115 +0,0 @@
-import abc
-from dataclasses import asdict, dataclass
-from inspect import getsource
-from typing import Any, Callable, List, Optional, Union
-@dataclass
-class AggMetricConfig(dict):
-    metric: Optional[str] = None
-    aggregation: Optional[str] = "mean"
-    weight_by_size: Optional[str] = False
-    # list of filter names which should be incorporated into the aggregated metric.
-    filter_list: Optional[Union[str, list]] = "none"
-    def __post_init__(self):
-        if self.aggregation != "mean" and not callable(self.aggregation):
-            raise ValueError(
-                f"Currently, 'mean' is the only pre-defined aggregation across groups' subtasks. Got '{self.aggregation}'."
-            )
-        if isinstance(self.filter_list, str):
-            self.filter_list = [self.filter_list]
-@dataclass
-class GroupConfig(dict):
-    group: Optional[str] = None
-    group_alias: Optional[str] = None
-    task: Optional[Union[str, list]] = None
-    aggregate_metric_list: Optional[
-        Union[List[AggMetricConfig], AggMetricConfig, dict]
-    ] = None
-    metadata: Optional[dict] = (
-        None  # by default, not used in the code. allows for users to pass arbitrary info to tasks
-    )
-    def __getitem__(self, item):
-        return getattr(self, item)
-    def __setitem__(self, item, value):
-        return setattr(self, item, value)
-    def __post_init__(self):
-        if self.aggregate_metric_list is not None:
-            if isinstance(self.aggregate_metric_list, dict):
-                self.aggregate_metric_list = [self.aggregate_metric_list]
-            self.aggregate_metric_list = [
-                AggMetricConfig(**item) if isinstance(item, dict) else item
-                for item in self.aggregate_metric_list
-            ]
-    def to_dict(self, keep_callable: bool = False) -> dict:
-        """dumps the current config as a dictionary object, as a printable format.
-        null fields will not be printed.
-        Used for dumping results alongside full task configuration
-        :return: dict
-            A printable dictionary version of the TaskConfig object.
-        # TODO: should any default value in the TaskConfig not be printed?
-        """
-        cfg_dict = asdict(self)
-        # remove values that are `None`
-        for k, v in list(cfg_dict.items()):
-            if callable(v):
-                cfg_dict[k] = self.serialize_function(v, keep_callable=keep_callable)
-        return cfg_dict
-    def serialize_function(
-        self, value: Union[Callable, str], keep_callable=False
-    ) -> Union[Callable, str]:
-        """Serializes a given function or string.
-        If 'keep_callable' is True, the original callable is returned.
-        Otherwise, attempts to return the source code of the callable using 'getsource'.
-        """
-        if keep_callable:
-            return value
-        else:
-            try:
-                return getsource(value)
-            except (TypeError, OSError):
-                return str(value)
-class ConfigurableGroup(abc.ABC):
-    def __init__(
-        self,
-        config: Optional[dict] = None,
-    ) -> None:
-        self._config = GroupConfig(**config)
-    @property
-    def group(self):
-        return self._config.group
-    @property
-    def group_alias(self):
-        return self._config.group_alias
-    @property
-    def version(self):
-        return self._config.version
-    @property
-    def config(self):
-        return self._config.to_dict()
-    @property
-    def group_name(self) -> Any:
-        return self._config.group
-    def __repr__(self):
-        return f"ConfigurableGroup(group={self.group},group_alias={self.group_alias})"

lm-evaluation-harness/lm_eval/api/instance.py DELETED Viewed

@@ -1,38 +0,0 @@
-from dataclasses import dataclass, field
-from typing import Literal, Optional, Tuple
-OutputType = Literal[
-    "loglikelihood", "loglikelihood_rolling", "generate_until", "multiple_choice"
-]
-@dataclass
-class Instance:
-    request_type: OutputType
-    doc: dict
-    arguments: tuple
-    idx: int
-    metadata: Tuple[Optional[str], Optional[int], Optional[int]] = field(
-        default_factory=lambda: (None, None, None)
-    )
-    resps: list = field(default_factory=list)
-    filtered_resps: dict = field(default_factory=dict)
-    # initialized after init
-    task_name: Optional[str] = None
-    doc_id: Optional[int] = None
-    repeats: Optional[int] = None
-    def __post_init__(self) -> None:
-        # unpack metadata field
-        self.task_name, self.doc_id, self.repeats = self.metadata
-    @property
-    def args(self):
-        """
-        Returns (string,) where `string` is the string to calculate loglikelihood over
-        """
-        return (
-            self.arguments if isinstance(self.arguments, tuple) else (self.arguments,)
-        )

lm-evaluation-harness/lm_eval/api/metrics.py DELETED Viewed

@@ -1,578 +0,0 @@
-import logging
-import math
-import random
-import re
-import string
-from collections.abc import Iterable
-from typing import List
-import numpy as np
-import sacrebleu
-from lm_eval.api.registry import register_aggregation, register_metric
-eval_logger = logging.getLogger(__name__)
-# Register Aggregations First
-@register_aggregation("bypass")
-def bypass_agg(arr):
-    """Placeholder aggregation: ignores ``arr`` and always returns 999."""
-    return 999
-@register_aggregation("nanmean")
-def nanmean(arr):
-    """Mean of ``arr`` ignoring NaNs; NaN when ``arr`` is empty or all-NaN."""
-    has_values = len(arr) != 0 and not all(np.isnan(arr))
-    return np.nanmean(arr) if has_values else np.nan
-@register_aggregation("mean")
-def mean(arr):
-    """Arithmetic mean of ``arr`` (raises ZeroDivisionError when it is empty)."""
-    total = sum(arr)
-    return total / len(arr)
-@register_aggregation("median")
-def median(arr):
-    """Return the median element of ``arr``.
-    The values are sorted first; indexing the middle of the raw input only
-    gives the median when the caller happens to pass sorted data. For an
-    even number of items the upper of the two middle elements is returned,
-    which matches the previous behaviour on already-sorted input.
-    Raises IndexError when ``arr`` is empty.
-    """
-    ordered = sorted(arr)
-    return ordered[len(ordered) // 2]
-# Certain metrics must be calculated across all documents in a benchmark.
-# We use them as aggregation metrics, paired with no-op passthrough metric fns.
-@register_aggregation("perplexity")
-def perplexity(items):
-    """Exponential of the negated mean of ``items``."""
-    average = mean(items)
-    return math.exp(-average)
-@register_aggregation("weighted_perplexity")
-def weighted_perplexity(items):
-    """Exponential of the negated ``weighted_mean`` of ``items``."""
-    average = weighted_mean(items)
-    return math.exp(-average)
-@register_aggregation("bits_per_byte")
-def bits_per_byte(items):
-    """Negated ``weighted_mean`` of ``items`` divided by ln(2)."""
-    negated = -weighted_mean(items)
-    return negated / math.log(2)
-@register_aggregation("f1")
-def f1_score(items):
-    """F1 over ``(gold, prediction)`` pairs, computed with scikit-learn."""
-    # imported lazily; inside this function the name refers to sklearn's f1_score
-    from sklearn.metrics import f1_score
-    columns = list(zip(*items))
-    golds, preds = columns[0], columns[1]
-    return np.max(f1_score(golds, preds))
-@register_aggregation("matthews_corrcoef")
-def matthews_corrcoef(items):
-    """Matthews correlation coefficient over ``(gold, prediction)`` pairs."""
-    # imported lazily; inside this function the name refers to sklearn's implementation
-    from sklearn.metrics import matthews_corrcoef
-    columns = list(zip(*items))
-    golds, preds = columns[0], columns[1]
-    return matthews_corrcoef(golds, preds)
-@register_aggregation("bleu")
-def bleu(items):
-    """Corpus-level BLEU over ``(reference, prediction)`` pairs.
-    The Bilingual Evaluation Understudy score compares a generated sentence
-    with a reference sentence by counting the n-grams they share (a 1-gram
-    is a single token, a bigram a word pair).
-    Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
-    Paper: https://www.aclweb.org/anthology/P02-1040/
-    Higher is better
-    """
-    columns = list(zip(*items))
-    refs, preds = _sacreformat(columns[0], columns[1])
-    return sacrebleu.corpus_bleu(preds, refs).score
-@register_aggregation("chrf")
-def chrf(items):
-    """Corpus-level chrF over ``(reference, prediction)`` pairs.
-    chrF++ evaluates machine translation output using character n-gram
-    precision and recall, enhanced with word n-grams.
-    Source: https://github.com/m-popovic/chrF
-    Paper: https://www.aclweb.org/anthology/W15-3049.pdf
-    Higher is better  # TODO I think
-    """
-    columns = list(zip(*items))
-    refs, preds = _sacreformat(columns[0], columns[1])
-    return sacrebleu.corpus_chrf(preds, refs).score
-@register_aggregation("ter")
-def ter(items):
-    """Corpus-level TER over ``(reference, prediction)`` pairs.
-    Translation Error Rate measures the number of edits required to change
-    a system output into one of the references.
-    Source: http://www.cs.umd.edu/~snover/tercom/
-    Paper: http://mt-archive.info/AMTA-2006-Snover.pdf
-    Lower is better
-    """
-    columns = list(zip(*items))
-    refs, preds = _sacreformat(columns[0], columns[1])
-    return sacrebleu.corpus_ter(preds, refs).score
-@register_aggregation("brier_score")
-def brier_score(items):
-    """Mean squared distance between predictions and one-hot gold labels.
-    ``items`` is a sequence of ``(gold, predictions)`` pairs; the class count
-    is taken from the second axis of the stacked predictions.
-    """
-    gold, predictions = zip(*items)
-    _, num_class = np.array(predictions).shape
-    # row i of the identity matrix is the one-hot vector for class i
-    one_hot = np.eye(num_class)[list(gold)]
-    squared_error = (predictions - one_hot) ** 2
-    return np.mean(np.sum(squared_error, axis=1))
-@register_metric(
-    metric="brier_score",
-    higher_is_better=False,
-    output_type=["multiple_choice"],
-    aggregation="brier_score",
-)
-def brier_score_fn(items):  # This is a passthrough function
-    """Return ``items`` unchanged; registered with aggregation "brier_score"."""
-    return items
-@register_metric(
-    metric="acc",
-    higher_is_better=True,
-    output_type=["loglikelihood", "multiple_choice"],
-    aggregation="mean",
-)
-def acc_fn(items):  # This is a passthrough function
-    """Return ``items`` unchanged; registered with aggregation "mean"."""
-    return items
-@register_metric(
-    metric="acc_norm",
-    higher_is_better=True,
-    output_type=["loglikelihood", "multiple_choice"],
-    aggregation="mean",
-)
-def acc_norm_fn(items):  # This is a passthrough function
-    """Return ``items`` unchanged; registered with aggregation "mean"."""
-    return items
-@register_metric(
-    metric="acc_mutual_info",
-    higher_is_better=True,
-    output_type="multiple_choice",
-    aggregation="mean",
-)
-def acc_mutual_info_fn(items):  # This is a passthrough function
-    """Return ``items`` unchanged; registered with aggregation "mean"."""
-    return items
-### the code used in the `exact_match_hf_evaluate` function is ported from
-### https://github.com/huggingface/evaluate/blob/main/metrics/exact_match/exact_match.py
-### which is under the apache license.
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#     http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-def exact_match_hf_evaluate(
-    predictions,
-    references,
-    regexes_to_ignore=None,
-    ignore_case=False,
-    ignore_punctuation=False,
-    ignore_numbers=False,
-):
-    """Fraction of predictions that equal their reference after normalisation.
-    :param predictions: sequence of predicted strings
-    :param references: sequence of reference strings, aligned with ``predictions``
-    :param regexes_to_ignore: optional iterable of regex patterns; every match
-        is removed from both sides before comparing
-    :param ignore_case: lowercase both sides before comparing
-    :param ignore_punctuation: strip ``string.punctuation`` characters
-    :param ignore_numbers: strip ``string.digits`` characters
-    :return: ``{"exact_match": <mean of the element-wise equality>}``
-    """
-    if regexes_to_ignore is not None:
-        for pattern in regexes_to_ignore:
-            predictions = [re.sub(pattern, "", x) for x in predictions]
-            references = [re.sub(pattern, "", x) for x in references]
-    # Convert unconditionally: with an empty ``regexes_to_ignore`` the inputs
-    # used to stay plain lists, so ``==`` below compared the two lists as a
-    # whole instead of element by element.
-    predictions = np.asarray(predictions)
-    references = np.asarray(references)
-    if ignore_case:
-        predictions = np.char.lower(predictions)
-        references = np.char.lower(references)
-    if ignore_punctuation:
-        repl_table = str.maketrans("", "", string.punctuation)
-        predictions = np.char.translate(predictions, table=repl_table)
-        references = np.char.translate(references, table=repl_table)
-    if ignore_numbers:
-        repl_table = str.maketrans("", "", string.digits)
-        predictions = np.char.translate(predictions, table=repl_table)
-        references = np.char.translate(references, table=repl_table)
-    score_list = predictions == references
-    return {"exact_match": np.mean(score_list)}
-###
-@register_metric(
-    metric="exact_match",
-    higher_is_better=True,
-    output_type="generate_until",
-    aggregation="mean",
-)
-def exact_match_fn(**kwargs):
-    """Forward all keyword arguments to ``exact_match_hf_evaluate`` and return its result."""
-    return exact_match_hf_evaluate(**kwargs)
-@register_metric(
-    metric="perplexity",
-    higher_is_better=False,
-    output_type="loglikelihood",
-    aggregation="perplexity",
-)
-def perplexity_fn(items):  # This is a passthrough function
-    """Return ``items`` unchanged; registered with aggregation "perplexity"."""
-    return items
-@register_metric(
-    metric="word_perplexity",
-    higher_is_better=False,
-    output_type="loglikelihood_rolling",
-    aggregation="weighted_perplexity",
-)
-def word_perplexity_fn(items):  # This is a passthrough function
-    """Return ``items`` unchanged; registered with aggregation "weighted_perplexity"."""
-    return items
-@register_metric(
-    metric="byte_perplexity",
-    higher_is_better=False,
-    output_type="loglikelihood_rolling",
-    aggregation="weighted_perplexity",
-)
-def byte_perplexity_fn(items):  # This is a passthrough function
-    """Return ``items`` unchanged; registered with aggregation "weighted_perplexity"."""
-    return items
-@register_metric(
-    metric="bits_per_byte",
-    higher_is_better=False,
-    output_type="loglikelihood_rolling",
-    aggregation="bits_per_byte",
-)
-def bits_per_byte_fn(items):  # This is a passthrough function
-    """Return ``items`` unchanged; registered with aggregation "bits_per_byte"."""
-    return items
-def pop_stddev(arr):
-    """Population standard deviation of ``arr`` (divides by ``len(arr)``)."""
-    centre = mean(arr)
-    squared_deviations = sum((x - centre) ** 2 for x in arr)
-    return math.sqrt(squared_deviations / len(arr))
-def sample_stddev(arr):
-    """Sample standard deviation of ``arr`` (divides by ``len(arr) - 1``).
-    A single-element ``arr`` therefore raises ZeroDivisionError.
-    """
-    centre = mean(arr)
-    squared_deviations = sum((x - centre) ** 2 for x in arr)
-    return math.sqrt(squared_deviations / (len(arr) - 1))
-def mean_stderr(arr):
-    """Standard error of the mean: ``sample_stddev(arr) / sqrt(len(arr))``."""
-    spread = sample_stddev(arr)
-    return spread / math.sqrt(len(arr))
-@register_metric(
-    metric="bypass",
-    higher_is_better=True,
-    output_type=["loglikelihood", "multiple_choice", "generate_until"],
-    aggregation="bypass",
-)
-def bypass(items):
-    """Ignore ``items`` and return None; registered with aggregation "bypass"."""
-    return None
-@register_metric(
-    metric="mcc",
-    higher_is_better=True,
-    output_type="multiple_choice",
-    aggregation="matthews_corrcoef",
-)
-def mcc_fn(items):  # This is a passthrough function
-    """Return ``items`` unchanged; registered with aggregation "matthews_corrcoef"."""
-    return items
-@register_metric(
-    metric="f1",
-    higher_is_better=True,
-    output_type="multiple_choice",
-    aggregation="f1",
-)
-def f1_fn(items):  # This is a passthrough function
-    """Return ``items`` unchanged; registered with aggregation "f1"."""
-    return items
-@register_metric(
-    metric="bleu",
-    higher_is_better=True,
-    output_type="generate_until",
-    aggregation="bleu",
-)
-def bleu_fn(items):  # This is a passthrough function
-    """Return ``items`` unchanged; registered with aggregation "bleu"."""
-    return items
-@register_metric(
-    metric="chrf",
-    higher_is_better=True,
-    output_type="generate_until",
-    aggregation="chrf",
-)
-def chrf_fn(items):  # This is a passthrough function
-    """Return ``items`` unchanged; registered with aggregation "chrf"."""
-    return items
-# TER counts the edits needed to turn the output into a reference, so a
-# smaller value is the better one; it was previously registered as
-# higher_is_better=True.
-@register_metric(
-    metric="ter",
-    higher_is_better=False,
-    output_type="generate_until",
-    aggregation="ter",
-)
-def ter_fn(items):  # This is a passthrough function
-    """Return ``items`` unchanged; registered with aggregation "ter"."""
-    return items
-@register_metric(
-    metric="acc_all",
-    higher_is_better=True,
-    output_type="loglikelihood",
-    aggregation="mean",
-)
-def acc_all(items):
-    """Share of questions whose answers were *all* labelled correctly.
-    ``items`` is a sequence of ``(pred, doc)`` pairs; answers are grouped by
-    ``(doc["idx"]["paragraph"], doc["idx"]["question"])``.
-    """
-    columns = list(zip(*items))
-    preds, docs = columns[0], columns[1]
-    per_question = {}
-    for doc, pred in zip(docs, preds):
-        key = (doc["idx"]["paragraph"], doc["idx"]["question"])
-        is_gold = doc["label"] == 1
-        per_question.setdefault(key, []).append(is_gold == pred)
-    # a question scores 1 only when every one of its answers matched
-    return np.mean([int(all(flags)) for flags in per_question.values()])
-def acc_all_stderr(items):
-    """Standard error of the per-question all-answers-correct score.
-    ``items`` is a sequence of ``(pred, doc)`` pairs. Answers are grouped by
-    ``(paragraph, question)``, the same key ``acc_all`` uses; grouping by the
-    question id alone merged questions from different paragraphs that share
-    an id, so the error was computed over different groups than the score.
-    """
-    # Only count as correct if all answers are labeled correctly for each question
-    question_scoring_dict = {}
-    preds = list(zip(*items))[0]
-    docs = list(zip(*items))[1]
-    for doc, pred in zip(docs, preds):
-        paragraph_id = doc["idx"]["paragraph"]
-        question_id = doc["idx"]["question"]
-        key = (paragraph_id, question_id)
-        if key not in question_scoring_dict:
-            question_scoring_dict[key] = []
-        gold_label = doc["label"] == 1
-        question_scoring_dict[key].append(gold_label == pred)
-    acc = mean_stderr([int(all(x)) for x in question_scoring_dict.values()])
-    return acc
-def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
-    """Best ``metric_fn(prediction, ground_truth)`` over all ground truths.
-    Raises ValueError when ``ground_truths`` is empty.
-    """
-    return max(metric_fn(prediction, truth) for truth in ground_truths)
-def weighted_mean(items):
-    """Sum of the first elements of ``items`` divided by the sum of the second."""
-    numerators, denominators = zip(*items)
-    total = sum(numerators)
-    return total / sum(denominators)
-def is_non_str_iterable(obj):
-    """True for any ``Iterable`` except ``str``."""
-    if isinstance(obj, str):
-        return False
-    return isinstance(obj, Iterable)
-def _sacreformat(refs, preds):
-    """Format refs and preds for sacrebleu corpus calculation. It is very particular"""
-    # Sacrebleu expects (List[str], List[List[str]])
-    #   e.g. sacrebleu.corpus_bleu([pred_t], [[ref1_stream], [ref2_stream], ...])
-    # Note [ref1_stream] is the first reference for each pred.
-    # So lists are size N and (M, N) for N preds and M possible refs for each pred
-    # This is a different order of dimensions than I would expect
-    # We expect refs to be List[str] or List[List[str]], the outer list corresponding to preds
-    # Must become List[List[str]] with the inner list corresponding to preds
-    if not is_non_str_iterable(refs):
-        refs = list(refs)
-    if not is_non_str_iterable(refs[0]):
-        # a single reference per pred: wrap each one in its own list
-        refs = [[ref] for ref in refs]
-    # transpose from one entry per pred to one stream per reference position
-    refs = list(zip(*refs))
-    # Note the number of refs in each ref list must match the number of preds
-    # We expect preds to be List[str] or List[List[str]]. Must become List[str]
-    if not is_non_str_iterable(preds):
-        preds = list(preds)
-    if is_non_str_iterable(preds[0]):
-        # only the first pred is length-checked before every pred is unwrapped
-        assert len(preds[0]) == 1, f"Pred must be a str, was {preds[0]}"
-        preds = [pred[0] for pred in preds]
-    return refs, preds
-# stderr stuff
-class _bootstrap_internal:
-    """Callable that draws ``n`` bootstrap resamples and applies ``f`` to each."""
-    def __init__(self, f, n) -> None:
-        self.f = f
-        self.n = n
-    def __call__(self, v):
-        """``v`` is ``(seed, xs)``; returns the list of ``n`` values of ``f``."""
-        seed, xs = v
-        # a private generator seeded per call keeps the resamples reproducible
-        rnd = random.Random()
-        rnd.seed(seed)
-        sample_size = len(xs)
-        return [self.f(rnd.choices(xs, k=sample_size)) for _ in range(self.n)]
-def bootstrap_stderr(f, xs, iters):
-    """Bootstrap estimate of the standard error of ``f`` on ``xs``.
-    Resampling is spread over a process pool in chunks of up to 1000
-    iterations; the result is the sample standard deviation of all resampled
-    values of ``f``.
-    :param f: function applied to each resample (its ``__name__`` is printed)
-    :param xs: the observations to resample with replacement
-    :param iters: requested number of bootstrap iterations; only
-        ``iters // chunk_size`` full chunks are run, so a remainder is dropped
-    """
-    import multiprocessing as mp
-    # this gives a biased estimate of the stderr (i.e w/ the mean, it gives something
-    # equivalent to stderr calculated without Bessel's correction in the stddev.
-    # Unfortunately, I haven't been able to figure out what the right correction is
-    # to make the bootstrap unbiased - i considered multiplying by sqrt(n/(n-1)) but
-    # that would be ad-hoc and I can't prove that that would actually be an unbiased estimator)
-    # Thankfully, shouldn't matter because our samples are pretty big usually anyways
-    res = []
-    chunk_size = min(1000, iters)
-    from tqdm import tqdm
-    print("bootstrapping for stddev:", f.__name__)
-    # The context manager shuts the workers down even when ``f`` raises while
-    # results are being consumed; the pool used to be closed only on success.
-    with mp.Pool(mp.cpu_count()) as pool:
-        for bootstrap in tqdm(
-            pool.imap(
-                _bootstrap_internal(f, chunk_size),
-                [(i, xs) for i in range(iters // chunk_size)],
-            ),
-            total=iters // chunk_size,
-        ):
-            # sample w replacement
-            res.extend(bootstrap)
-    return sample_stddev(res)
-def stderr_for_metric(metric, bootstrap_iters: int):
-    """Pick the stderr function that matches an aggregation function.
-    Returns None when ``bootstrap_iters`` is not positive or when ``metric``
-    has no known stderr function. Aggregations in the bootstrap list get a
-    closure around ``bootstrap_stderr``; ``mean`` and ``acc_all`` map to
-    ``mean_stderr`` and ``acc_all_stderr``.
-    """
-    if bootstrap_iters <= 0:
-        # return no function (don't compute stderr) if bootstrap iters = 0
-        return None
-    needs_bootstrap = (
-        median,
-        matthews_corrcoef,
-        f1_score,
-        perplexity,
-        bleu,
-        chrf,
-        ter,
-        nanmean,
-    )
-    if metric in needs_bootstrap:
-        return lambda x: bootstrap_stderr(metric, x, iters=bootstrap_iters)
-    closed_form = {mean: mean_stderr, acc_all: acc_all_stderr}
-    return closed_form.get(metric)
-def pooled_sample_stderr(stderrs: List[float], sizes: List[int]):
-    """Pool per-subtask standard errors into one, weighting by subtask size.
-    Used to aggregate bootstrapped stderrs across the subtasks of a group.
-    Formula source: https://en.wikipedia.org/wiki/Pooled_variance
-    and: https://stats.stackexchange.com/a/4841331
-    This empirically seems to match running `stderr_for_metric` on all
-    instances from the subtasks concatenated with each other.
-    """
-    assert len(stderrs) == len(sizes)
-    total_size = sum(sizes)
-    weighted_squares = sum(
-        [(size - 1) * stderr**2 * size for size, stderr in zip(sizes, stderrs)]
-    )
-    pooled_sample_var = weighted_squares / (total_size - len(sizes))
-    return np.sqrt(pooled_sample_var / total_size)
-def combined_sample_stderr(stderrs: List[float], sizes: List[int], metrics=None):
-    """Fold per-subtask stderrs, sizes and mean scores into one combined value.
-    Starts from the first subtask and merges the remaining ones pairwise,
-    updating a running size-weighted mean score and a running variance, then
-    returns the square root of the final variance.
-    ``metrics`` is required despite its ``None`` default.
-    """
-    assert metrics is not None, (
-        "Need to pass a list of each subtask's metric for this stderr aggregation"
-    )
-    assert len(stderrs) == len(sizes) and len(sizes) == len(metrics)
-    # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1390 for more documentation.
-    # This formula depends on sample means.
-    # removed because it seems to give erroneously huge stderrs for groupings of tasks
-    # and does not seem to match up with bootstrap-calculated stderrs for groups.
-    ### don't use this unless a statistician has told you it's the right thing to do ###
-    # accumulators: we'll aggregate pairwise N - 1 times
-    variance = stderrs[0] ** 2
-    curr_size = sizes[0]
-    curr_score = metrics[0]
-    for stderr, size, score in zip(stderrs[1:], sizes[1:], metrics[1:]):
-        curr_score = ((curr_score * curr_size) + (score * size)) / (
-            curr_size + size
-        )  # NOTE: this assumes our aggregation fn is "mean"
-        # NOTE(review): curr_score is already the merged mean when it is used in
-        # the (curr_score - score) term below, and curr_size is never advanced
-        # inside the loop — confirm both are intended.
-        variance = ((curr_size - 1) * variance + (size - 1) * (stderr**2)) / (
-            curr_size + size - 1
-        ) + curr_size * size / ((curr_size + size) * (curr_size + size - 1)) * (
-            curr_score - score
-        ) ** 2
-    return np.sqrt(variance)
-def aggregate_subtask_metrics(metrics, sizes, weight_by_size=True):
-    """Average subtask scores across tasks, optionally weighted by size.
-    With ``weight_by_size=False`` every subtask gets weight 1.
-    TODO: does not hold for non-mean aggregations
-    """
-    weights = sizes if weight_by_size else [1] * len(sizes)
-    assert len(metrics) == len(weights)
-    weighted_total = sum([metric * weight for metric, weight in zip(metrics, weights)])
-    return weighted_total / sum(weights)

lm-evaluation-harness/lm_eval/api/model.py DELETED Viewed

@@ -1,493 +0,0 @@
-import abc
-import hashlib
-import json
-import logging
-import os
-from typing import Dict, List, Optional, Tuple, Type, TypeVar, Union
-import transformers
-from sqlitedict import SqliteDict
-from tqdm import tqdm
-from lm_eval import utils
-eval_logger = logging.getLogger(__name__)
-T = TypeVar("T", bound="LM")
-class LM(abc.ABC):
-    """Abstract base class for language models used by the harness.
-    Subclasses must implement ``loglikelihood``, ``loglikelihood_rolling``
-    and ``generate_until``.
-    """
-    def __init__(self) -> None:
-        """Defines the interface that should be implemented by all LM subclasses.
-        LMs are assumed to take text (strings) as input and yield strings as output
-        (inputs/outputs should be tokenization-agnostic.)
-        """
-        # set rank and world size to a single process, by default.
-        self._rank = 0
-        self._world_size = 1
-        # a hook built from None has no backing store, so caching starts disabled
-        self.cache_hook = CacheHook(None)
-    @abc.abstractmethod
-    def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
-        """Compute log-likelihood of generating a continuation from a context.
-        Downstream tasks should attempt to use loglikelihood instead of other
-        LM calls whenever possible.
-        :param requests: list[Instance]
-            A list of Instance objects, with property `args` which returns a tuple (context, continuation).
-            `context: str`
-                Context string. Implementations of LM must be able to handle an
-                empty context string.
-            `continuation: str`
-                The continuation over which log likelihood will be calculated. If
-                there is a word boundary, the space should be in the continuation.
-                For example, context="hello" continuation=" world" is correct.
-        :return: list[tuple[float, bool]]
-            A list of pairs (logprob, isgreedy)
-            `logprob: float`
-                The log probability of `continuation`.
-            `isgreedy`:
-                Whether `continuation` would be generated by greedy sampling from `context`.
-        """
-        pass
-    @abc.abstractmethod
-    def loglikelihood_rolling(self, requests) -> List[float]:
-        """Compute full log-likelihood of a string, with no truncation, for perplexity computation
-        - We will use the full max context length of the model.
-        - For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
-        the max context length.
-        - IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementations
-          which may simply concatenate multiple documents together.
-        - IMPORTANT: We maximize the amount of context for each prediction. Specifically, for inputs that we break into
-          multiple chunks, the last input will still have a full-sized context.
-          Example:
-            Input tokens: [ 0 1 2 3 4 5 6 7 8 9 ]
-            Prefix: BOS/EOS
-            Max context length: 4
-            Resulting input/prediction pairs:
-                INPUT:  BOS   0   1   2
-                PRED:     0   1   2   3
-                INPUT:    3   4   5   6
-                PRED:     4   5   6   7
-                INPUT:    5   6   7   8
-                PRED:             8   9
-          Observe that:
-            1. Each token is predicted exactly once
-            2. For the last pair, we provide the full context, but only score the last two tokens
-        :param requests: list[Instance]
-            A list of Instance objects with property `args` which returns a tuple (context,).
-            string: str
-                String for which we are computing overall loglikelihood
-        :return: list[tuple[float]]
-            A list of tuples (logprob,)
-            logprob: float
-                The log probability of `context` conditioned on the BOS/EOS token.
-                Can also be overridden for custom cases by `prefix_token_id`.
-            NOTE(review): the return annotation is List[float] while this text
-            describes 1-tuples — confirm which one implementations follow.
-        """
-        pass
-    # TODO: Add an optional max length
-    @abc.abstractmethod
-    def generate_until(self, requests) -> List[str]:
-        """Generate greedily until a stopping sequence
-        :param requests: list[Instance]
-            A list of Instance objects with property `args` which returns a tuple (context, gen_kwargs).
-            context: str
-                Context string
-            gen_kwargs: dict
-                A dictionary of keyword arguments to pass to the generation function e.g. top_k, until, etc.
-        :return: list[str]
-            A list of model generated continuations.
-            continuation: str
-                The generated continuation.
-        """
-        pass
-    def apply_chat_template(
-        self, chat_history: List[Dict[str, str]], add_generation_prompt=True
-    ) -> str:
-        """
-        Defines how to transform few-shot examples provided as chat history into a format that can be used as input to the LM.
-        The base implementation always raises NotImplementedError.
-        :param chat_history: list[dict[str, str]]
-            A list of dictionaries with keys 'role' and 'content'.
-            Values are strings representing the role name and the content of the message, respectively.
-        :param add_generation_prompt: bool
-            Whether to append an assistant gen prefix (for e.g. <|assistant|>) to the assistant messages in the chat history. False if prefilling an assistant message.
-        :return: str
-            A string representing the chat history in a format that can be used as input to the LM.
-        """
-        raise NotImplementedError(
-            "To use this model with chat templates, please implement the 'apply_chat_template' method for your model type."
-        )
-    @classmethod
-    def create_from_arg_string(
-        cls: Type[T], arg_string: str, additional_config: Optional[dict] = None
-    ) -> T:
-        """
-        Creates an instance of the LM class using the given argument string and additional config.
-        Parameters:
-        - arg_string: A string containing arguments in the format key1=value1,key2=value2.
-        - additional_config: Optional dictionary containing additional configuration parameters.
-          Entries whose value is None are dropped before the constructor is called.
-        Returns:
-        - Instance of the LM class.
-        """
-        additional_config = {} if additional_config is None else additional_config
-        args = utils.simple_parse_args_string(arg_string)
-        args2 = {k: v for k, v in additional_config.items() if v is not None}
-        return cls(**args, **args2)
-    @classmethod
-    def create_from_arg_obj(
-        cls: Type[T], arg_dict: dict, additional_config: Optional[dict] = None
-    ) -> T:
-        """
-        Creates an instance of the LM class using the given arg_dict
-        Parameters:
-        - arg_dict: A dict of keyword arguments passed to the class constructor.
-        - additional_config: Optional dictionary containing additional configuration parameters.
-          Entries whose value is None are dropped before the constructor is called.
-        Returns:
-        - Instance of the LM class.
-        """
-        additional_config = {} if additional_config is None else additional_config
-        additional_config = {
-            k: v for k, v in additional_config.items() if v is not None
-        }
-        return cls(**arg_dict, **additional_config)
-    @property
-    def rank(self):
-        # used in the case of parallelism. Hardcoded to
-        # ensure no errors arise using API models which do
-        # not support multi-device parallelism nor expect it.
-        return self._rank
-    @property
-    def world_size(self):
-        # used in the case of parallelism. Hardcoded to
-        # ensure no errors arise using API models which do
-        # not support multi-device parallelism nor expect it.
-        return self._world_size
-    @property
-    def tokenizer_name(self) -> str:
-        """Must be defined for LM subclasses which implement Chat Templating.
-        Should return the name of the tokenizer or chat template used.
-        Used only to properly fingerprint caches when requests are being cached with `--cache_requests`, otherwise not used.
-        The base implementation always raises NotImplementedError.
-        """
-        raise NotImplementedError(
-            "To use this model with chat templates, please implement the 'tokenizer_name' property."
-        )
-    def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
-        """Returns the chat template structure for user/assistant messages if a template is provided.
-        This method is intended to be overridden in a subclass to define a specific chat template format.
-        The base implementation ignores its argument and returns the empty string.
-        """
-        return ""
-    def set_cache_hook(self, cache_hook) -> None:
-        """Replace the hook stored in ``self.cache_hook``."""
-        self.cache_hook = cache_hook
-### SQLite-based caching of LM responses
-def hash_args(attr, args):
-    """SHA-256 hex digest of the JSON encoding of ``[attr, *args]``."""
-    payload = json.dumps([attr, *args])
-    digest = hashlib.sha256(payload.encode("utf-8"))
-    return digest.hexdigest()
-class CacheHook:
-    """Writes partial results into a caching wrapper's store, if there is one."""
-    def __init__(self, cachinglm) -> None:
-        # without a wrapper there is no store, and add_partial becomes a no-op
-        self.dbdict = None if cachinglm is None else cachinglm.dbdict
-    def add_partial(self, attr, req, res) -> None:
-        """Store ``res`` under the hash of ``(attr, req)`` when a store exists."""
-        if self.dbdict is not None:
-            key = hash_args(attr, req)
-            self.dbdict[key] = res
-class CachingLM:
-    """Wrapper that serves request results from an on-disk cache where it can."""
-    def __init__(self, lm, cache_db) -> None:
-        """LM wrapper that returns cached results if they exist, and uses the underlying LM if not.
-        :param lm: LM
-            Underlying LM
-        :param cache_db: str
-            Path to cache db
-        """
-        self.lm = lm
-        self.cache_db = cache_db
-        # create the parent directory only when the path actually has one
-        if os.path.dirname(cache_db):
-            os.makedirs(os.path.dirname(cache_db), exist_ok=True)
-        self.dbdict = SqliteDict(cache_db, autocommit=True)
-        # add hook to lm
-        lm.set_cache_hook(self.get_cache_hook())
-    def __getattr__(self, attr: str):
-        """Proxy attribute access to the wrapped LM.
-        The three request methods are replaced by a caching closure; every
-        other attribute is returned from the wrapped LM as is.
-        """
-        lm_attr = getattr(self.lm, attr)
-        if attr not in ["loglikelihood", "loglikelihood_rolling", "generate_until"]:
-            eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM")
-            return lm_attr
-        def fn(requests):
-            # ``res`` has one slot per request; None marks a slot still to be filled
-            res = []
-            remaining_reqs = []
-            warned = False
-            # figure out which ones are cached and which ones are new
-            eval_logger.info(
-                f"Loading '{attr}' responses from cache '{self.cache_db}' where possible..."
-            )
-            for req in tqdm(requests, desc="Checking cached requests"):
-                hsh = hash_args(attr, req.args)
-                if attr == "generate_until" and req.args[1].get("do_sample", False):
-                    # when we are doing non-greedy generation, don't use the cache
-                    # (else every "randomly sampled" generation would be identical for repeats > 1).
-                    if not warned:
-                        eval_logger.warning(
-                            f"Arguments to lm.generate_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests."
-                        )
-                        warned = True
-                    res.append(None)
-                    remaining_reqs.append(req)
-                elif hsh in self.dbdict:
-                    ob = self.dbdict[hsh]
-                    assert ob is not None
-                    res.append(ob)
-                else:
-                    res.append(None)
-                    remaining_reqs.append(req)
-            eval_logger.info(
-                f"Cached requests: {len(requests) - len(remaining_reqs)}, Requests remaining: {len(remaining_reqs)}"
-            )
-            if remaining_reqs:
-                # actually run the LM on the requests that do not have cached results
-                rem_res = getattr(self.lm, attr)(remaining_reqs)
-            else:
-                rem_res = []
-            # stick the new ones back into the list and also cache any of the new ones
-            resptr = 0
-            for req, r in zip(remaining_reqs, rem_res):
-                # advance to the next empty slot; slots and remaining_reqs share their order
-                while res[resptr] is not None:
-                    resptr += 1
-                res[resptr] = r
-                # caching
-                # NOTE(review): results of do_sample requests are written here too,
-                # although the warning above says they are not cached — confirm.
-                hsh = hash_args(attr, req.args)
-                self.dbdict[hsh] = r
-            self.dbdict.commit()
-            return res
-        return fn
-    def get_cache_hook(self):
-        """Return a new ``CacheHook`` bound to this wrapper."""
-        return CacheHook(self)
-class TemplateLM(LM):
-    """
-    A class acting as intermediary between the LM base class
-    and boilerplate often included in other LM subclasses.
-    """
-    tokenizer = None
-    @property
-    @abc.abstractmethod
-    def eot_token_id(self):
-        pass
-    @property
-    def prefix_token_id(self):
-        # it is used as prefix for loglikelihood
-        return self.eot_token_id
-    @abc.abstractmethod
-    def tok_encode(self, string: str, **kwargs) -> List[int]:
-        """
-        Tokenize a string using the model's tokenizer and return a list of token IDs.
-        """
-        pass
-    @abc.abstractmethod
-    def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
-        pass
-    def _encode_pair(
-        self, context: str, continuation: str
-    ) -> Tuple[List[int], List[int]]:
-        n_spaces = len(context) - len(context.rstrip())
-        if n_spaces > 0:
-            continuation = context[-n_spaces:] + continuation
-            context = context[:-n_spaces]
-        model_class = getattr(self, "AUTO_MODEL_CLASS", None)
-        if model_class == transformers.AutoModelForSeq2SeqLM:
-            context_enc = self.tok_encode(context)
-            continuation_enc = self.tok_encode(continuation, add_special_tokens=False)
-        else:
-            whole_enc = self.tok_encode(context + continuation)
-            context_enc = self.tok_encode(context)
-            context_enc_len = len(context_enc)
-            continuation_enc = whole_enc[context_enc_len:]
-        return context_enc, continuation_enc
-    def loglikelihood(
-        self, requests, disable_tqdm: bool = False
-    ) -> List[Tuple[float, bool]]:
-        new_reqs = []
-        for context, continuation in [req.args for req in requests]:
-            if context == "":
-                # BOS or EOS as context
-                context_enc, continuation_enc = (
-                    [self.prefix_token_id],
-                    self.tok_encode(continuation),
-                )
-            else:
-                context_enc, continuation_enc = self._encode_pair(context, continuation)
-            new_reqs.append(((context, continuation), context_enc, continuation_enc))
-        return self._loglikelihood_tokens(new_reqs, disable_tqdm=disable_tqdm)
-    @abc.abstractmethod
-    def loglikelihood_rolling(
-        self, requests, disable_tqdm: bool = False
-    ) -> List[float]:
-        """Abstract hook for rolling log-likelihood scoring; subclasses implement it.
-        :param requests: the requests to score.
-        :param disable_tqdm: bool
-            Accepted for parity with `loglikelihood`; presumably toggles a
-            progress bar - confirm against implementations.
-        :return: List[float] per the annotation.
-        """
-    @abc.abstractmethod
-    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
-        """Abstract hook for free-form generation; subclasses implement it.
-        :return: List[str] per the annotation.
-        """
-    def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
-        """
-        Set and get the appropriate chat template for the model.
-        This method sets the tokenizer's chat_template and returns the template string for reproducibility.
-        The template selection logic is adapted from the Transformers library's `apply_chat_template`
-        method in the Tokenizer class. The original implementation can be found at:
-        https://github.com/huggingface/transformers/blob/fc35907f95459d7a6c5281dfadd680b6f7b620e3/src/transformers/tokenization_utils_base.py#L1687
-        This method ensures that the right template is chosen based on the following:
-        0. If the model has no 'tokenizer' attribute (or it is None): assumes that there is only a single possible chat template, handled on the model provider side internally. Returns the empty string.
-        1. If the model's tokenizer has multiple templates:
-            a. Use the specified template if it exists in the dictionary.
-            b. Use the default template from the list if no specific template is provided.
-            c. Raise an error if no default template exists and no specific template is provided.
-        2. If the model's tokenizer has a single template or no template:
-            a. Use the tokenizer's chat template if available.
-            b. Fall back to the default chat template if no tokenizer chat template exists.
-        Args:
-            chat_template (Union[bool, str]): Specifies the chat template to use.
-                - If False or None, no template is applied.
-                - If True, the default or only available template is used.
-                - If a string, the template with the matching name is used.
-        Returns:
-            Optional[str]: The selected chat template; "" when the model has no tokenizer;
-            None if no template is applied or the tokenizer exposes no template attribute.
-        Raises:
-            ValueError: If the tokenizer holds a dict of templates and the requested name
-            (or, when none is requested, a "default" entry) is missing.
-        """
-        # `getattr` honours case 0 above: a model that never defines `tokenizer`
-        # is treated like one whose tokenizer is None instead of raising AttributeError.
-        tokenizer = getattr(self, "tokenizer", None)
-        if tokenizer is None:
-            return ""
-        if chat_template is False or chat_template is None:
-            eval_logger.warning(
-                "model.chat_template was called with the chat_template set to False or None. "
-                "Therefore no chat template will be applied. Make sure this is an intended behavior."
-            )
-            return None
-        # Convert boolean chat_template to None to ensure compatibility with the adapted logic
-        if isinstance(chat_template, bool):
-            chat_template = None
-        using_default_template = False
-        # First, handle the cases when the model has a dict of multiple templates
-        try:
-            template = tokenizer.chat_template or tokenizer.default_chat_template
-        except AttributeError:
-            # tokenizer exposes neither attribute: nothing to select from
-            return None
-        if isinstance(template, dict):
-            using_default_dict = tokenizer.chat_template is None
-            if chat_template is not None:
-                if chat_template in template:
-                    selected_template = template[chat_template]
-                    if using_default_dict:
-                        using_default_template = True
-                else:
-                    raise ValueError(
-                        f"The specified chat template '{chat_template}' is not available. "
-                        f"Available template names are {sorted(template.keys())}."
-                    )
-            else:
-                # If user didn't pass a chat template, use the default template from the dict
-                if "default" in template:
-                    selected_template = template["default"]
-                    using_default_template = True
-                else:
-                    raise ValueError(
-                        "This model has multiple chat templates with no default specified! Please either pass a chat "
-                        "template or the name of the template you wish to use to the `chat_template` argument. Available "
-                        f"template names are {sorted(template.keys())}."
-                    )
-        # Cases when the model has a single template or no template
-        else:
-            # priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template
-            if isinstance(chat_template, str):
-                eval_logger.warning(
-                    "Chat template name provided, but the tokenizer's chat template is not a dictionary. "
-                    "Using the tokenizer's chat template or the default template instead."
-                )
-            if tokenizer.chat_template is not None:
-                selected_template = tokenizer.chat_template
-            else:
-                selected_template = tokenizer.default_chat_template
-                using_default_template = True
-        if using_default_template:
-            eval_logger.warning(
-                "No chat template is set for this tokenizer, falling back to a default class-level template. This is "
-                "very error-prone, because models are often trained with templates different from the class default! "
-                "Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which "
-                "point any code depending on them will stop working. We recommend setting a valid chat template before "
-                "then to ensure that this model continues working without issues."
-            )
-        return selected_template

lm-evaluation-harness/lm_eval/api/registry.py DELETED Viewed

@@ -1,196 +0,0 @@
-import logging
-from typing import Callable, Dict, Union
-import evaluate as hf_evaluate
-from lm_eval.api.model import LM
-# Module-level logger, named after this module.
-eval_logger = logging.getLogger(__name__)
-# Maps each registered model alias to its class; filled by `register_model`.
-MODEL_REGISTRY = {}
-def register_model(*names):
-    """Return a class decorator that files the class in MODEL_REGISTRY under every alias in `names`.
-    Aliases arrive as a tuple of strings. For each alias the decorator asserts
-    that the class extends `LM` and that the alias is still free, then records it.
-    The class itself is returned unchanged.
-    """
-    def decorate(cls):
-        for name in names:
-            extends_lm = issubclass(cls, LM)
-            assert extends_lm, (
-                f"Model '{name}' ({cls.__name__}) must extend LM class"
-            )
-            alias_is_free = name not in MODEL_REGISTRY
-            assert alias_is_free, (
-                f"Model named '{name}' conflicts with existing model! Please register with a non-conflicting alias instead."
-            )
-            MODEL_REGISTRY[name] = cls
-        return cls
-    return decorate
-def get_model(model_name):
-    """Look up the model class registered under `model_name`.
-    :raises ValueError: if no model was registered under that name; the message
-        lists every registered alias.
-    """
-    try:
-        model_cls = MODEL_REGISTRY[model_name]
-    except KeyError:
-        supported = ", ".join(MODEL_REGISTRY.keys())
-        raise ValueError(
-            f"Attempted to load model '{model_name}', but no model for this name found! Supported model names: {supported}"
-        )
-    return model_cls
-# Task name -> registered task object; filled by `register_task`.
-TASK_REGISTRY = {}
-# Group name -> list of member task names; filled by `register_group`.
-GROUP_REGISTRY = {}
-# Every registered task name and group name.
-ALL_TASKS = set()
-# `fn.__name__` -> task name, so `register_group` can resolve a decorated object back to its task.
-func2task_index = {}
-def register_task(name):
-    """Return a decorator that registers the decorated object as task `name`.
-    The decorator asserts the name is unused, stores the object in
-    TASK_REGISTRY, adds the name to ALL_TASKS and remembers the
-    `__name__` -> task-name mapping in `func2task_index`.
-    """
-    def decorate(fn):
-        name_is_free = name not in TASK_REGISTRY
-        assert name_is_free, (
-            f"task named '{name}' conflicts with existing registered task!"
-        )
-        TASK_REGISTRY[name] = fn
-        ALL_TASKS.add(name)
-        func2task_index[fn.__name__] = name
-        return fn
-    return decorate
-def register_group(name):
-    """Return a decorator that adds an already-registered task to group `name`.
-    The task name is resolved through `func2task_index`, so the decorated
-    object must have gone through `register_task` first (otherwise the lookup
-    raises KeyError). A new group name is also added to ALL_TASKS.
-    """
-    def decorate(fn):
-        task_name = func2task_index[fn.__name__]
-        if name not in GROUP_REGISTRY:
-            # first member: create the group and expose its name
-            GROUP_REGISTRY[name] = [task_name]
-            ALL_TASKS.add(name)
-        else:
-            GROUP_REGISTRY[name].append(task_name)
-        return fn
-    return decorate
-# NOTE(review): nothing in this module writes to OUTPUT_TYPE_REGISTRY - confirm it is still used.
-OUTPUT_TYPE_REGISTRY = {}
-# Metric name -> metric function; filled by `register_metric`.
-METRIC_REGISTRY = {}
-# Metric name -> aggregation function taken from AGGREGATION_REGISTRY; filled by `register_metric`.
-METRIC_AGGREGATION_REGISTRY = {}
-# Aggregation name -> aggregation function; filled by `register_aggregation`.
-AGGREGATION_REGISTRY: Dict[str, Callable[[], Dict[str, Callable]]] = {}
-# Metric name -> its `higher_is_better` value; filled by `register_metric`.
-HIGHER_IS_BETTER_REGISTRY = {}
-# Filter name -> filter class; filled by `register_filter`.
-FILTER_REGISTRY = {}
-# Metric names listed per output type (presumably the defaults used when a task names none - confirm with the consumer).
-DEFAULT_METRIC_REGISTRY = {
-    "loglikelihood": [
-        "perplexity",
-        "acc",
-    ],
-    "loglikelihood_rolling": ["word_perplexity", "byte_perplexity", "bits_per_byte"],
-    "multiple_choice": ["acc", "acc_norm"],
-    "generate_until": ["exact_match"],
-}
-def register_metric(**args):
-    """Return a decorator that registers a metric function and its metadata.
-    Recognised keyword arguments:
-        metric (required): name under which the function is stored in METRIC_REGISTRY.
-        higher_is_better: value stored in HIGHER_IS_BETTER_REGISTRY for the metric.
-        aggregation: name of a registered aggregation; the function found in
-            AGGREGATION_REGISTRY is stored in METRIC_AGGREGATION_REGISTRY.
-    Other keyword arguments are ignored here.
-    :raises AssertionError: if `metric` is missing or the metric name already
-        has an entry in one of the registries being written.
-    :raises KeyError: if `aggregation` names an aggregation that is not registered.
-    """
-    # TODO: do we want to enforce a certain interface to registered metrics?
-    def decorate(fn):
-        assert "metric" in args
-        name = args["metric"]
-        for key, registry in [
-            ("metric", METRIC_REGISTRY),
-            ("higher_is_better", HIGHER_IS_BETTER_REGISTRY),
-            ("aggregation", METRIC_AGGREGATION_REGISTRY),
-        ]:
-            if key not in args:
-                continue
-            value = args[key]
-            # All three registries are keyed by the metric name, so that is what
-            # must be checked. Testing `value` (e.g. an aggregation name) reported
-            # a conflict whenever it equalled some other metric's name.
-            assert name not in registry, (
-                f"{key} named '{value}' conflicts with existing registered {key}!"
-            )
-            if key == "metric":
-                registry[name] = fn
-            elif key == "aggregation":
-                registry[name] = AGGREGATION_REGISTRY[value]
-            else:
-                registry[name] = value
-        return fn
-    return decorate
-def get_metric(name: str, hf_evaluate_metric=False) -> Callable:
-    """Resolve a metric callable by name.
-    Unless `hf_evaluate_metric` is set, METRIC_REGISTRY is consulted first. When
-    the name is not found there (or the registry is bypassed) the metric is
-    loaded through `hf_evaluate.load` and its `compute` method is returned.
-    :return: the metric callable, or None (after logging an error) when the
-        `hf_evaluate` lookup raises.
-    """
-    if not hf_evaluate_metric:
-        if name in METRIC_REGISTRY:
-            return METRIC_REGISTRY[name]
-        eval_logger.warning(
-            f"Could not find registered metric '{name}' in lm-eval, searching in HF Evaluate library..."
-        )
-    try:
-        return hf_evaluate.load(name).compute
-    except Exception:
-        # best-effort lookup: failure is logged and reported to the caller as None
-        eval_logger.error(
-            f"{name} not found in the evaluate library! Please check https://huggingface.co/evaluate-metric",
-        )
-    return None
-def register_aggregation(name: str):
-    """Return a decorator that stores the decorated function in AGGREGATION_REGISTRY under `name`.
-    :raises AssertionError: (inside the decorator) if `name` is already registered.
-    """
-    def decorate(fn):
-        name_is_free = name not in AGGREGATION_REGISTRY
-        assert name_is_free, (
-            f"aggregation named '{name}' conflicts with existing registered aggregation!"
-        )
-        AGGREGATION_REGISTRY[name] = fn
-        return fn
-    return decorate
-def get_aggregation(name: str) -> Callable[[], Dict[str, Callable]]:
-    """Return the aggregation registered as `name`; log a warning and return None if there is none."""
-    try:
-        aggregation = AGGREGATION_REGISTRY[name]
-    except KeyError:
-        eval_logger.warning(f"{name} not a registered aggregation metric!")
-        return None
-    return aggregation
-def get_metric_aggregation(name: str) -> Callable[[], Dict[str, Callable]]:
-    """Return the default aggregation recorded for metric `name`; log a warning and return None if there is none."""
-    try:
-        aggregation = METRIC_AGGREGATION_REGISTRY[name]
-    except KeyError:
-        eval_logger.warning(f"{name} metric is not assigned a default aggregation!")
-        return None
-    return aggregation
-def is_higher_better(metric_name) -> bool:
-    """Return the `higher_is_better` value recorded for `metric_name`.
-    When nothing was recorded a warning is logged and None is returned.
-    """
-    try:
-        higher_is_better = HIGHER_IS_BETTER_REGISTRY[metric_name]
-    except KeyError:
-        eval_logger.warning(
-            f"higher_is_better not specified for metric '{metric_name}'!"
-        )
-        return None
-    return higher_is_better
-def register_filter(name):
-    """Return a class decorator that stores the class in FILTER_REGISTRY under `name`.
-    Unlike the other registries, an existing entry is overwritten; the
-    re-registration is only logged at info level.
-    """
-    def decorate(cls):
-        already_known = name in FILTER_REGISTRY
-        if already_known:
-            eval_logger.info(
-                f"Registering filter `{name}` that is already in Registry {FILTER_REGISTRY}"
-            )
-        FILTER_REGISTRY[name] = cls
-        return cls
-    return decorate
-def get_filter(filter_name: Union[str, Callable]) -> Callable:
-    """Resolve a filter by registry name, or pass a callable straight through.
-    :return: the registered entry for `filter_name`, or `filter_name` itself
-        when it is not registered but is callable.
-    :raises KeyError: if `filter_name` is neither registered nor callable
-        (a warning is logged first).
-    """
-    try:
-        return FILTER_REGISTRY[filter_name]
-    except KeyError as e:
-        if not callable(filter_name):
-            eval_logger.warning(f"filter `{filter_name}` is not registered!")
-            raise e
-        return filter_name

lm-evaluation-harness/lm_eval/api/samplers.py DELETED Viewed

@@ -1,232 +0,0 @@
-import logging
-import warnings
-from functools import partial
-from typing import TYPE_CHECKING, Iterable, Optional, Union
-import datasets
-if TYPE_CHECKING:
-    from random import Random
-    from lm_eval.api.task import ConfigurableTask, Task
-# Logger shared under the fixed "lm-eval" name (not the module name).
-eval_logger = logging.getLogger("lm-eval")
-class ContextSampler:
-    """Draws few-shot documents for a task and renders them into prompt context.
-    The formatting callables (`doc_to_text`, `doc_to_target`, `doc_to_choice`)
-    come from the task; each can be overridden through the task config's
-    `fewshot_config` mapping. Subclasses customise selection via `sample`.
-    """
-    def __init__(
-        self,
-        docs: list[dict],
-        task: Union["Task", "ConfigurableTask"],
-        fewshot_indices: Optional[Iterable] = None,
-        rnd: Optional["Random"] = None,
-    ) -> None:
-        """
-        :param docs: pool of few-shot documents to sample from.
-        :param task: task supplying `_config` and the `doc_to_*` formatters.
-        :param fewshot_indices: optional indices selecting a subset of `docs`;
-            only supported when `docs` is a `datasets.Dataset`.
-        :param rnd: random generator used by `sample`; required.
-        :raises ValueError: if `rnd` is missing, or `fewshot_indices` is given
-            while `docs` is not a `datasets.Dataset`.
-        """
-        self.rnd = rnd
-        if not self.rnd:
-            raise ValueError(
-                "A `random.Random` generator argument must be provided to `rnd` of FewShotSampler!"
-            )
-        self.task = task
-        self.config = task._config
-        self.target_delimiter = self.config.target_delimiter
-        self.fewshot_delimiter = self.config.fewshot_delimiter
-        self.doc_to_text = self._fewshot_formatter("doc_to_text")
-        self.doc_to_target = self._fewshot_formatter("doc_to_target")
-        self.doc_to_choice = self._fewshot_formatter("doc_to_choice")
-        self.docs = docs  # HF dataset split, provided by task._fewshot_docs()
-        if fewshot_indices:  # subset few-shot docs from
-            if not isinstance(self.docs, datasets.Dataset):
-                raise ValueError(
-                    "Got `fewshot_indices` but fewshot_docs are not a HF dataset. Don't use both `fewshot_indices` and a user-defined few-shot sample list simultaneously"
-                )
-            self.docs = self.docs.select(fewshot_indices)
-    def _fewshot_formatter(self, field: str):
-        """Return the task's `field` formatter, bound to the `fewshot_config` override when one is set."""
-        formatter = getattr(self.task, field)
-        fewshot_config = self.config.fewshot_config
-        override = None if fewshot_config is None else fewshot_config.get(field, None)
-        if override is None:
-            return formatter
-        return partial(formatter, **{field: override})
-    def _target_text(self, doc: dict, doc_target) -> str:
-        """Render a few-shot target as text.
-        A list yields its first element; an index is resolved through
-        `doc_to_choice` when the task defines choices; anything else is
-        stringified, so a non-string target without choices no longer breaks
-        string concatenation with a TypeError.
-        """
-        if isinstance(doc_target, list):
-            return str(doc_target[0])
-        if self.config.doc_to_choice is None or isinstance(doc_target, str):
-            return str(doc_target)
-        return str(self.doc_to_choice(doc)[doc_target])
-    def _select_fewshot_docs(self, doc: dict, num_fewshot: int):
-        """Sample few-shot docs, leaving out `doc` itself, capped at `num_fewshot`."""
-        # draw an extra fewshot sample if using same split as evaluating on
-        n_samples = (
-            num_fewshot + 1
-            if self.config.fewshot_split == self.config.test_split
-            else num_fewshot
-        )
-        # draw `n_samples` docs from fewshot_docs
-        fewshotex = self.sample(n_samples)
-        # get rid of the doc that's the one we're evaluating, if it's in the fewshot
-        # TODO: should we just stop people from using fewshot from same split as evaluating?
-        return [x for x in fewshotex if x != doc][:num_fewshot]
-    def get_context(self, doc: dict, num_fewshot: int, gen_prefix: str = None):
-        """Build the plain-text few-shot context for `doc`.
-        :param doc: the document being evaluated; it is excluded from the examples.
-        :param num_fewshot: number of examples to include.
-        :param gen_prefix: optional text placed (followed by a space) before each target.
-        :return: str, the concatenated examples; each example that has a target
-            is followed by `fewshot_delimiter`.
-        """
-        prefix = gen_prefix + " " if gen_prefix else ""
-        pieces = []
-        for shot in self._select_fewshot_docs(doc, num_fewshot):
-            doc_content = self.doc_to_text(shot)
-            doc_target = self.doc_to_target(shot)
-            if self.config.doc_to_choice is None or isinstance(doc_content, str):
-                pieces.append(doc_content)
-            else:
-                pieces.append(self.doc_to_choice(shot)[doc_content])
-            if doc_target != "":
-                # `[:1]` instead of `[0]` so a target with an empty string form cannot raise IndexError
-                if self.target_delimiter.isspace() and str(doc_target)[:1].isspace():
-                    # TODO: add logger warn once here.
-                    warnings.warn(
-                        "Both target_delimiter and target start with a space. This may cause issues.",
-                        Warning,
-                        stacklevel=2,
-                    )
-                pieces.append(self.target_delimiter)
-                pieces.append(prefix)
-                pieces.append(self._target_text(shot, doc_target))
-                pieces.append(self.fewshot_delimiter)
-        return "".join(pieces)
-    def get_chat_context(
-        self,
-        doc: dict,
-        num_fewshot: int,
-        fewshot_as_multiturn: bool = False,
-        gen_prefix: Optional[str] = None,
-    ):
-        """Build the few-shot context as a list of chat messages.
-        With `fewshot_as_multiturn` every example becomes a user/assistant
-        message pair; otherwise the whole `get_context` string is sent as one
-        user message.
-        :return: list of `{"role": ..., "content": ...}` dicts.
-        """
-        # TODO: Do we need any other delimiter
-        prefix = gen_prefix + " " if gen_prefix else ""
-        chat_history = []
-        # Sampled on both paths (as before) so the state of `rnd` stays the same;
-        # the single-turn path samples again inside `get_context`.
-        selected_docs = self._select_fewshot_docs(doc, num_fewshot)
-        if fewshot_as_multiturn:
-            for shot in selected_docs:
-                doc_content = self.doc_to_text(shot)
-                doc_target = self.doc_to_target(shot)
-                if self.config.doc_to_choice is None or isinstance(doc_content, str):
-                    user_content = doc_content
-                else:
-                    user_content = self.doc_to_choice(shot)[doc_content]
-                chat_history.append({"role": "user", "content": user_content})
-                chat_history.append(
-                    {
-                        "role": "assistant",
-                        "content": prefix + self._target_text(shot, doc_target),
-                    }
-                )
-        else:
-            # get fewshot context as one user turn
-            chat_history.append(
-                {
-                    "role": "user",
-                    "content": self.get_context(
-                        doc, num_fewshot, gen_prefix=gen_prefix
-                    ),
-                }
-            )
-        return chat_history
-    def sample(self, n: int):
-        """
-        Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
-        """
-        return self.rnd.sample(self.docs, n)
-class FirstNSampler(ContextSampler):
-    """Sampler that always takes the leading few-shot documents instead of drawing at random."""
-    def sample(self, n: int):
-        """
-        Draw the first `n` samples in order from the specified split.
-        Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU.
-        :return: `self.docs[:n]` (the previous `-> None` annotation was wrong: a slice is returned).
-        :raises AssertionError: if `n` exceeds the number of available docs.
-        """
-        assert n <= len(self.docs), (
-            f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available."
-        )
-        return self.docs[:n]
-class BalancedSampler(ContextSampler):
-    """Placeholder for a class-balanced sampler; not implemented yet."""
-    def sample(self, n: int):
-        """
-        TODO: this should return approximately class-balanced samples from our fewshot examples.
-        TODO: what order should they be in? maybe random?
-        :raises NotImplementedError: always. The former silent `None` return only
-            failed later, when `get_context` tried to iterate the result.
-        """
-        raise NotImplementedError(
-            "BalancedSampler.sample is not implemented yet."
-        )
-class ManualSampler(ContextSampler):
-    """Placeholder for a manually specified sampler; not implemented yet."""
-    def sample(self, n: int):
-        """Not implemented.
-        :raises NotImplementedError: always. The former silent `None` return only
-            failed later, when `get_context` tried to iterate the result.
-        """
-        raise NotImplementedError(
-            "ManualSampler.sample is not implemented yet."
-        )
-# Sampler name -> sampler class, consulted by `get_sampler`.
-# BalancedSampler and ManualSampler are not listed here.
-SAMPLER_REGISTRY = {
-    "default": ContextSampler,
-    "first_n": FirstNSampler,
-}
-def get_sampler(name: str):
-    """Return the sampler class registered under `name` in SAMPLER_REGISTRY.
-    :raises ValueError: if `name` is unknown; the message lists the supported
-        sampler names (it used to call them "model names", a copy-paste slip).
-    """
-    try:
-        return SAMPLER_REGISTRY[name]
-    except KeyError:
-        raise ValueError(
-            f"Attempted to use contextsampler '{name}', but no sampling strategy for this name found! Supported sampler names: {', '.join(SAMPLER_REGISTRY.keys())}"
-        )

lm-evaluation-harness/lm_eval/api/task.py DELETED Viewed

@@ -1,1879 +0,0 @@
-import abc
-import ast
-import logging
-import random
-import re
-from collections.abc import Callable
-from copy import deepcopy
-from dataclasses import asdict, dataclass
-from inspect import getsource
-from typing import (
-    Any,
-    Dict,
-    Iterable,
-    Iterator,
-    List,
-    Literal,
-    Mapping,
-    Optional,
-    Tuple,
-    Union,
-)
-import datasets
-import numpy as np
-from tqdm import tqdm
-from lm_eval import utils
-from lm_eval.api import samplers
-from lm_eval.api.instance import Instance, OutputType
-from lm_eval.api.metrics import bits_per_byte, mean, weighted_perplexity
-from lm_eval.api.registry import (
-    AGGREGATION_REGISTRY,
-    DEFAULT_METRIC_REGISTRY,
-    get_aggregation,
-    get_metric,
-    get_metric_aggregation,
-    is_higher_better,
-)
-from lm_eval.caching.cache import load_from_cache, save_to_cache
-from lm_eval.filters import build_filter_ensemble
-from lm_eval.prompts import get_prompt
-# The output-type names known to this module.
-ALL_OUTPUT_TYPES = [
-    "loglikelihood",
-    "multiple_choice",
-    "loglikelihood_rolling",
-    "generate_until",
-]
-# Module-level logger, named after this module.
-eval_logger = logging.getLogger(__name__)
-@dataclass
-class TaskConfig(dict):
-    """Declarative task configuration.
-    Values live in dataclass fields; `cfg["field"]` and `cfg["field"] = value`
-    are forwarded to attribute access, so both spellings work.
-    """
-    # --- task naming/registry ---
-    task: Optional[str] = None
-    task_alias: Optional[str] = None
-    tag: Optional[Union[str, list]] = None
-    # --- HF dataset options: which dataset to use and which split serves which purpose ---
-    custom_dataset: Optional[Callable] = None
-    dataset_path: Optional[str] = None
-    dataset_name: Optional[str] = None
-    dataset_kwargs: Optional[dict] = None
-    training_split: Optional[str] = None
-    validation_split: Optional[str] = None
-    test_split: Optional[str] = None
-    fewshot_split: Optional[str] = (
-        None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaluating (?)
-    )
-    # --- formatting / prompting options (see docs/advanced_task_guide.md) ---
-    process_docs: Optional[Callable] = None
-    doc_to_text: Optional[Union[Callable, str]] = None
-    doc_to_target: Optional[Union[Callable, str]] = None
-    doc_to_image: Union[Callable, str] = None
-    doc_to_audio: Union[Callable, str] = None
-    unsafe_code: bool = False
-    doc_to_choice: Optional[Union[Callable, str, dict, list]] = None
-    process_results: Optional[Union[Callable, str]] = None
-    use_prompt: Optional[str] = None
-    description: str = ""
-    target_delimiter: str = " "
-    fewshot_delimiter: str = "\n\n"
-    fewshot_config: Optional[dict] = None
-    # --- runtime configuration options ---
-    num_fewshot: Optional[int] = None
-    # --- scoring options ---
-    metric_list: Optional[list] = None
-    output_type: OutputType = "generate_until"
-    generation_kwargs: Optional[dict] = None
-    repeats: int = 1
-    filter_list: Optional[Union[str, list]] = None
-    should_decontaminate: bool = False
-    doc_to_decontamination_query: Optional[str] = None
-    gen_prefix: Optional[str] = None
-    metadata: Optional[dict] = (
-        None  # by default, not used in the code. allows for users to pass arbitrary info to tasks
-    )
-    def __post_init__(self) -> None:
-        """Normalise `generation_kwargs` once the fields are set.
-        Without explicit kwargs, a `generate_until` task gets greedy defaults
-        that stop at the few-shot delimiter. With explicit kwargs, the
-        temperature is cast to float and a missing `until` falls back to the
-        few-shot delimiter. The supplied dict is updated in place.
-        """
-        if self.generation_kwargs is None:
-            if self.output_type == "generate_until":
-                # ensure that we greedily generate in absence of explicit arguments otherwise
-                until = (
-                    None if self.fewshot_delimiter is None else [self.fewshot_delimiter]
-                )
-                self.generation_kwargs = {
-                    "until": until,
-                    "do_sample": False,
-                    "temperature": 0,
-                }
-                eval_logger.warning(
-                    f"{self.task}: No `generation_kwargs` specified in task config, defaulting to {self.generation_kwargs}"
-                )
-            return
-        if self.output_type != "generate_until":
-            eval_logger.warning(
-                f"[{self.task}] passed `generation_kwargs`, but not using `output_type: generate_until`!"
-            )
-        gen_kwargs = self.generation_kwargs
-        if "temperature" in gen_kwargs:
-            gen_kwargs["temperature"] = float(gen_kwargs["temperature"])
-        if "until" not in gen_kwargs:
-            eval_logger.warning(
-                f"{self.task}: No `until` specified in `generation_kwargs`! Defaulting to the fewshot_delimiter={repr(self.fewshot_delimiter)}"
-            )
-            gen_kwargs["until"] = [self.fewshot_delimiter]
-    def __getitem__(self, item):
-        """Dict-style read, forwarded to the attribute of the same name."""
-        return getattr(self, item)
-    def __setitem__(self, item, value):
-        """Dict-style write, forwarded to the attribute of the same name."""
-        return setattr(self, item, value)
-    def to_dict(self, keep_callable: bool = False) -> dict:
-        """Dump the config as a plain, printable dictionary.
-        Fields whose value is None are left out. Callables (top level, or as
-        values inside the `metric_list` entries) go through `serialize_function`.
-        Used for dumping results alongside the full task configuration.
-        :param keep_callable: passed on to `serialize_function`.
-        :return: dict
-            A printable dictionary version of the TaskConfig object.
-        # TODO: should any default value in the TaskConfig not be printed?
-        """
-        cfg_dict = asdict(self)
-        # iterate over a snapshot because entries are removed along the way
-        for key, value in tuple(cfg_dict.items()):
-            if value is None:
-                del cfg_dict[key]
-                continue
-            if key == "metric_list":
-                for metric_dict in value:
-                    for metric_key, metric_value in metric_dict.items():
-                        if callable(metric_value):
-                            metric_dict[metric_key] = self.serialize_function(
-                                metric_value, keep_callable=keep_callable
-                            )
-                continue
-            if callable(value):
-                cfg_dict[key] = self.serialize_function(
-                    value, keep_callable=keep_callable
-                )
-        return cfg_dict
-    def serialize_function(
-        self, value: Union[Callable, str], keep_callable=False
-    ) -> Union[Callable, str]:
-        """Make a callable printable.
-        With `keep_callable` the value is handed back untouched. Otherwise the
-        source code from `getsource` is returned, falling back to `str(value)`
-        when the source cannot be retrieved (TypeError or OSError).
-        """
-        if keep_callable:
-            return value
-        try:
-            source = getsource(value)
-        except (TypeError, OSError):
-            return str(value)
-        return source
-class Task(abc.ABC):
-    """A task represents an entire benchmark including its dataset, problems,
-    answers, and evaluation methods. See BoolQ for a simple example implementation
-    A `doc` can be any python object which represents one instance of evaluation.
-    This is usually a dictionary e.g.
-        {"question": ..., "answer": ...} or
-        {"question": ..., question, answer)
-    """
-    VERSION: Optional[Union[int, str]] = None
-    # The name of the `Task` benchmark as denoted in the HuggingFace datasets Hub
-    # or a path to a custom `datasets` loading script.
-    DATASET_PATH: Optional[str] = None
-    # The name of a subset within `DATASET_PATH`.
-    DATASET_NAME: Optional[str] = None
-    OUTPUT_TYPE: Optional[OutputType] = None
-    def __init__(
-        self,
-        data_dir: Optional[str] = None,
-        cache_dir: Optional[str] = None,
-        download_mode: Optional[datasets.DownloadMode] = None,
-        config: Optional[Mapping] = None,  # Union[dict, TaskConfig]
-    ) -> None:
-        """
-        :param data_dir: str
-            Stores the path to a local folder containing the `Task`'s data files.
-            Use this to specify the path to manually downloaded data (usually when
-            the dataset is not publicly accessible).
-        :param cache_dir: str
-            The directory to read/write the `Task` dataset. This follows the
-            HuggingFace `datasets` API with the default cache directory located at:
-                `~/.cache/huggingface/datasets`
-            NOTE: You can change the cache location globally for a given process
-            by setting the shell environment variable, `HF_DATASETS_CACHE`,
-            to another directory:
-                `export HF_DATASETS_CACHE="/path/to/another/directory"`
-        :param download_mode: datasets.DownloadMode
-            How to treat pre-existing `Task` downloads and data.
-            - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS`
-                Reuse download and reuse dataset.
-            - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS`
-                Reuse download with fresh dataset.
-            - `datasets.DownloadMode.FORCE_REDOWNLOAD`
-                Fresh download and fresh dataset.
-        :param config: Mapping
-            Optional field values for the task's `TaskConfig`; keys must be
-            `TaskConfig` field names. When omitted or empty, a default
-            `TaskConfig` is used.
-        """
-        self.download(data_dir, cache_dir, download_mode)
-        self._training_docs: Optional[list] = None
-        self._fewshot_docs: Optional[list] = None
-        self._instances: Optional[List[Instance]] = None
-        # Expand the mapping into keyword arguments. Passing it positionally
-        # (`TaskConfig({**config})`) stored the whole mapping in the first
-        # field, `task`, and left every other field at its default.
-        self._config: TaskConfig = TaskConfig(**config) if config else TaskConfig()
-        self._filters = [build_filter_ensemble("none", [["take_first", None]])]
-        self.fewshot_rnd: Optional[random.Random] = (
-            None  # purposely induce errors in case of improper usage
-        )
-    def download(
-        self,
-        data_dir: Optional[str] = None,
-        cache_dir: Optional[str] = None,
-        download_mode=None,
-    ) -> None:
-        """Load the task dataset and keep it on `self.dataset`.
-        The dataset is identified by the class attributes `DATASET_PATH` and
-        `DATASET_NAME` and loaded through `datasets.load_dataset`.
-        Override this method to download the dataset from a custom API.
-        :param data_dir: str
-            Path to a local folder containing the `Task`'s data files. Use this
-            for manually downloaded data (usually when the dataset is not
-            publicly accessible).
-        :param cache_dir: str
-            The directory to read/write the `Task` dataset. This follows the
-            HuggingFace `datasets` API with the default cache directory located at:
-                `~/.cache/huggingface/datasets`
-            NOTE: You can change the cache location globally for a given process
-            by setting the shell environment variable, `HF_DATASETS_CACHE`,
-            to another directory:
-                `export HF_DATASETS_CACHE="/path/to/another/directory"`
-        :param download_mode: datasets.DownloadMode
-            How to treat pre-existing `Task` downloads and data.
-            - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS`
-                Reuse download and reuse dataset.
-            - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS`
-                Reuse download with fresh dataset.
-            - `datasets.DownloadMode.FORCE_REDOWNLOAD`
-                Fresh download and fresh dataset.
-        """
-        load_kwargs = dict(
-            path=self.DATASET_PATH,
-            name=self.DATASET_NAME,
-            data_dir=data_dir,
-            cache_dir=cache_dir,
-            download_mode=download_mode,
-        )
-        self.dataset = datasets.load_dataset(**load_kwargs)
-    @property
-    def config(self) -> TaskConfig:
-        """Read-only access to this task's `TaskConfig` (stored in `_config`)."""
-        task_config = self._config
-        return task_config
-    @abc.abstractmethod
-    def has_training_docs(self):
-        """Tell whether this task provides a training set (abstract; subclasses answer)."""
-    @abc.abstractmethod
-    def has_validation_docs(self):
-        """Tell whether this task provides a validation set (abstract; subclasses answer)."""
-    @abc.abstractmethod
-    def has_test_docs(self):
-        """Tell whether this task provides a test set (abstract; subclasses answer)."""
-    def training_docs(self) -> Iterable:
-        """Training documents; the base implementation has none.
-        :return: Iterable[obj]
-            An iterable of objects that doc_to_text can handle (empty here).
-        """
-        return list()
-    def validation_docs(self) -> Iterable:
-        """Validation documents; the base implementation has none.
-        :return: Iterable[obj]
-            An iterable of objects that doc_to_text can handle (empty here).
-        """
-        return list()
-    def test_docs(self) -> Iterable:
-        """Test documents; the base implementation has none.
-        :return: Iterable[obj]
-            An iterable of objects that doc_to_text can handle (empty here).
-        """
-        return list()
-    def fewshot_docs(self) -> Iterable:
-        """
-        :return: Iterable[obj]
-            A iterable of any object, that doc_to_text can handle
-        """
-        if self.has_training_docs():
-            return self.training_docs()
-        elif self.has_validation_docs():
-            return self.validation_docs()
-        else:
-            if self.config.get("num_fewshot", 0) > 0:
-                eval_logger.warning(
-                    f"[Task: {self.config.task}] has_training_docs and has_validation_docs are False"
-                    ", using test_docs as fewshot_docs but this is not recommended."
-                )
-            return self.test_docs()
-    def _process_doc(self, doc: dict) -> dict:
-        """
-        Override this to process (detokenize, strip, replace, etc.) individual
-        documents. This can be used in a map over documents of a data split.
-        E.g. `map(self._process_doc, self.dataset["validation"])`
-        :return: dict
-            The processed version of the specified `doc`.
-        """
-        return doc
-    @property
-    def instances(self) -> List[Instance]:
-        """After calling `task.build_all_requests()`, tasks
-        maintain a list of the dataset instances which will be evaluated.
-        """
-        return self._instances
-    def fewshot_examples(self, k, rnd):
-        if self._training_docs is None:
-            self._training_docs = list(self.training_docs())
-        return rnd.sample(self._training_docs, k)
-    def doc_to_decontamination_query(self, doc):
-        raise NotImplementedError(
-            "Override doc_to_decontamination_query with document specific decontamination query."
-        )
-    @abc.abstractmethod
-    def doc_to_text(self, doc):
-        pass
-    @abc.abstractmethod
-    def doc_to_target(self, doc):
-        pass
-    # not an abstractmethod because not every language-only task has to implement this
-    def doc_to_image(self, doc):
-        raise NotImplementedError
-    def doc_to_audio(self, doc):
-        raise NotImplementedError
-    def doc_to_prefix(self, doc):
-        return ""
-    def build_all_requests(
-        self,
-        *,
-        limit: Union[int, None] = None,
-        samples: Optional[List[int]] = None,
-        rank: int = 0,
-        world_size: int = 1,
-        cache_requests: bool = False,
-        rewrite_requests_cache: bool = False,
-        system_instruction: Optional[str] = None,
-        apply_chat_template: bool = False,
-        fewshot_as_multiturn: bool = False,
-        chat_template: Optional[Callable] = None,
-        tokenizer_name: str = "",
-    ) -> None:
-        """Build a set of Instances for a task, and store them in task.instances
-        :param limit:
-            Maximum number of documents to keep. Forwarded to `doc_iterator`,
-            except when a fresh cache is being written: then every document is
-            processed and the limit is applied afterwards by slicing.
-        :param samples:
-            Explicit document indices, forwarded to `doc_iterator`.
-        :param rank:
-            Forwarded to `doc_iterator`; also part of the cache key.
-        :param world_size:
-            Forwarded to `doc_iterator`; also part of the cache key.
-        :param cache_requests:
-            Whether to load previously built instances from the cache and to
-            save newly built ones.
-        :param rewrite_requests_cache:
-            When True, ignore any cached instances and rebuild (and re-save) them.
-        :param system_instruction:
-            Forwarded to `fewshot_context`; its hash is part of the cache key.
-        :param apply_chat_template:
-            Forwarded to `fewshot_context` and `construct_requests`; part of the cache key.
-        :param fewshot_as_multiturn:
-            Forwarded to `fewshot_context`; part of the cache key.
-        :param chat_template:
-            Forwarded to `fewshot_context` and `construct_requests`.
-        :param tokenizer_name:
-            Appended to the cache key.
-        :raises ValueError:
-            If no instances were produced.
-        """
-        # used with caching
-        # keep the caller's limit: `limit` itself may be reset to None below
-        og_limit = limit
-        cache_key = f"requests-{self._config.task}-{self.config.num_fewshot}shot-rank{rank}-world_size{world_size}"
-        cache_key += "-chat_template" if apply_chat_template else ""
-        cache_key += "-fewshot_as_multiturn" if fewshot_as_multiturn else ""
-        cache_key += (
-            f"-system_prompt_hash{utils.hash_string(system_instruction)}"
-            if system_instruction is not None
-            else ""
-        )
-        cache_key += f"-tokenizer{tokenizer_name}"
-        cached_instances = load_from_cache(file_name=cache_key, cache=cache_requests)
-        # cache hit: slice the per-document groups by `limit`, flatten, and stop
-        if cache_requests and cached_instances and not rewrite_requests_cache:
-            cached_instances = cached_instances[:limit]
-            flattened_instances = [
-                instance
-                for instance_group in cached_instances
-                for instance in instance_group
-            ]
-            self._instances = flattened_instances
-            return
-        eval_logger.info(f"Building contexts for {self.config.task} on rank {rank}...")
-        # one entry per document; each entry is the list of instances for that document
-        instances = []
-        # process all documents when caching is specified for simplicity
-        if (
-            cache_requests
-            and (not cached_instances or rewrite_requests_cache)
-            and limit is not None
-        ):
-            limit = None
-        doc_id_docs = list(
-            self.doc_iterator(
-                rank=rank, limit=limit, samples=samples, world_size=world_size
-            )
-        )
-        num_docs = len(doc_id_docs)
-        for doc_id, doc in tqdm(
-            doc_id_docs,
-            total=num_docs,
-        ):
-            # sample fewshot context #TODO: need to offset doc_id by rank now!
-            fewshot_ctx = self.fewshot_context(
-                doc,
-                0 if self.config.num_fewshot is None else self.config.num_fewshot,
-                system_instruction,
-                apply_chat_template,
-                fewshot_as_multiturn,
-                chat_template,
-                gen_prefix=self.doc_to_prefix(doc),
-            )
-            # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
-            inst = self.construct_requests(
-                doc=doc,
-                ctx=fewshot_ctx,
-                metadata=(self.config["task"], doc_id, self.config.repeats),
-                apply_chat_template=apply_chat_template,
-                chat_template=chat_template,
-            )
-            # normalize a single returned instance into a one-element group
-            if not isinstance(inst, list):
-                inst = [inst]
-            instances.append(inst)
-        # now flatten, this is to allow slicing to work with pickles
-        # the slice uses the caller's original limit and counts documents, not instances
-        sliced_instances = instances[:og_limit]
-        flattened_instances = [
-            instance
-            for instance_group in sliced_instances
-            for instance in instance_group
-        ]
-        self._instances = flattened_instances
-        if len(self._instances) == 0:
-            raise ValueError("task.build_requests() did not find any docs!")
-        # the unsliced, unflattened groups are what gets cached
-        if cache_requests and (not cached_instances or rewrite_requests_cache):
-            save_to_cache(file_name=cache_key, obj=instances)
-    @abc.abstractmethod
-    def construct_requests(self, doc, ctx, **kwargs):
-        """Uses RequestFactory to construct Requests and returns an iterable of
-        Requests which will be sent to the LM.
-        :param doc:
-            The document as returned from training_docs, validation_docs, or test_docs.
-        :param ctx: str
-            The context string, generated by fewshot_context. This includes the natural
-            language description, as well as the few shot examples, and the question
-            part of the document for `doc`.
-        :param doc_idx: int
-            The index of a document within `self.test_docs()` or `self.validation_docs()`,
-            whichever is the main split used.
-        :param repeats: int
-        TODO: update this docstring
-            The number of times each instance in a dataset is inferred on. Defaults to 1,
-            can be increased for techniques like majority voting.
-        """
-        pass
-    @abc.abstractmethod
-    def process_results(self, doc, results):
-        """Take a single document and the LM results and evaluates, returning a
-        dict where keys are the names of submetrics and values are the values of
-        the metric for that one document
-        :param doc:
-            The document as returned from training_docs, validation_docs, or test_docs.
-        :param results:
-            The results of the requests created in construct_requests.
-        """
-        pass
-    @abc.abstractmethod
-    def aggregation(self):
-        """
-        :returns: {str: [metric_score] -> float}
-            A dictionary where keys are the names of submetrics and values are
-            functions that aggregate a list of metric scores
-        """
-        pass
-    @abc.abstractmethod
-    def higher_is_better(self):
-        """
-        :returns: {str: bool}
-            A dictionary where keys are the names of submetrics and values are
-            whether a higher value of the submetric is better
-        """
-        pass
-    def get_config(self, key: str) -> Any:
-        return getattr(self._config, key, None)
-    @classmethod
-    def count_bytes(cls, doc):
-        """Used for byte-level perplexity metrics in rolling loglikelihood"""
-        return len(doc.encode("utf-8"))
-    @classmethod
-    def count_words(cls, doc):
-        """Downstream loglikelihood_rolling perplexity tasks with custom word boundaries should override this!"""
-        return len(re.split(r"\s+", doc))
-    @utils.positional_deprecated
-    def fewshot_context(self, doc, num_fewshot, rnd=None, description=None, **kwargs):
-        """Returns a fewshot context string that is made up of a prepended description
-        (if provided), the `num_fewshot` number of examples, and an appended prompt example.
-        :param doc: str
-            The document as returned from training_docs, validation_docs, or test_docs.
-        :param num_fewshot: int
-            The number of fewshot examples to provide in the returned context string.
-        :param rnd: random.Random
-            The pseudo-random number generator used to randomly sample examples.
-            WARNING: This is currently a required arg although it's optionalized with a default `None`.
-        :param description: str
-            The task's description that will be prepended to the fewshot examples.
-        :returns: str
-            The fewshot context.
-        """
-        if rnd is None:
-            if self.fewshot_rnd is not None:
-                rnd = self.fewshot_rnd
-            else:
-                raise ValueError(
-                    "A `random.Random` generator argument must be provided to `rnd`"
-                )
-        description = description if description else ""
-        if num_fewshot == 0:
-            labeled_examples = ""
-        else:
-            # for sets with no training docs, draw from other set *but ensure no overlap with current doc*
-            if self.has_training_docs():
-                fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
-            else:
-                if self._fewshot_docs is None:
-                    self._fewshot_docs = list(
-                        self.validation_docs()
-                        if self.has_validation_docs()
-                        else self.test_docs()
-                    )
-                fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
-                # get rid of the doc that's the one we're evaluating, if it's in the fewshot
-                fewshotex = [x for x in fewshotex if x != doc][:num_fewshot]
-            labeled_examples = (
-                "\n\n".join(
-                    [
-                        self.doc_to_text(doc) + self.doc_to_target(doc)
-                        for doc in fewshotex
-                    ]
-                )
-                + "\n\n"
-            )
-        example = self.doc_to_text(doc)
-        return description + labeled_examples + example
-    def apply_filters(self) -> Optional[List[Instance]]:
-        """Iterates over FilterEnsembles and applies them to instances"""
-        if hasattr(self, "_filters"):
-            for f in self._filters:
-                f.apply(self._instances)
-        else:
-            eval_logger.warning("No filter defined, passing through instances")
-            return self._instances
-    def dump_config(self) -> dict:
-        """Returns the config as a dictionary."""
-        # TODO: this should only return the overrides applied to a non-YAML task's configuration.
-        # (num_fewshot)
-        return self.config.to_dict()
-    def set_config(self, key: str, value: Any, update: bool = False) -> None:
-        """Set or update the configuration for a given key."""
-        if key is None:
-            raise ValueError("Key must be provided.")
-        if update:
-            current_value = getattr(self._config, key, {})
-            if not isinstance(current_value, dict):
-                raise TypeError(
-                    f"Expected a dict for key '{key}', got {type(current_value).__name__} instead."
-                )
-            current_value.update(value)
-        else:
-            setattr(self._config, key, value)
-    def override_metric(self, metric_name: str) -> None:
-        """
-        Override the default metrics used for evaluation with custom metrics.
-        Parameters:
-        - metric_name (str): The name of the custom metric to override. Should be registered in api.metrics.
-        """
-        (
-            self._metric_fn_list,
-            self._aggregation_list,
-            self._metric_fn_kwargs,
-            self._higher_is_better,
-        ) = ({}, {}, {}, {})
-        self._metric_fn_list[metric_name] = get_metric(metric_name)
-        self._aggregation_list[metric_name] = get_metric_aggregation(metric_name)
-        self._higher_is_better[metric_name] = is_higher_better(metric_name)
-        self._metric_fn_kwargs[metric_name] = {}
-        if not isinstance(self, ConfigurableTask):
-            self.process_results = lambda x, y: {metric_name: get_metric(metric_name)}
-            self.aggregation = lambda: {
-                metric_name: get_metric_aggregation(metric_name)
-            }
-        setattr(self._config, "metric_list", [{"metric": metric_name}])
-        setattr(self._config, "process_results", None)
-    def set_fewshot_seed(self, seed: Optional[int] = None) -> None:
-        self.fewshot_rnd = random.Random(seed)
-        if hasattr(self, "sampler"):
-            self.sampler.rnd = self.fewshot_rnd
-    @property
-    def eval_docs(self) -> Union[datasets.Dataset, List[dict]]:
-        if self.has_test_docs():
-            return self.test_docs()
-        elif self.has_validation_docs():
-            return self.validation_docs()
-        else:
-            raise ValueError(
-                f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
-            )
-    def doc_iterator(
-        self,
-        *,
-        rank: int = 0,
-        limit: Union[int, None] = None,
-        world_size: int = 1,
-        samples: Optional[List[int]] = None,
-    ) -> Iterator[Tuple[int, Any]]:
-        if samples:
-            n = len(self.eval_docs)
-            assert all([e < n for e in samples]), (
-                f"Elements of --samples should be in the interval [0,k-1] where k is the number of total examples. In this case, k={n}."
-            )
-            eval_logger.info(
-                f"{self.config.task}: Evaluating on {len(samples)} examples"
-            )
-            doc_iterator = utils.create_iterator(
-                enumerate(x for i, x in enumerate(self.eval_docs) if i in samples),
-                rank=int(rank),
-                limit=None,  # limit does not matter here since we are selecting samples directly
-                world_size=int(world_size),
-            )
-        else:
-            limit = int(limit) if limit else None
-            doc_iterator = utils.create_iterator(
-                enumerate(self.eval_docs),
-                rank=int(rank),
-                limit=limit,
-                world_size=int(world_size),
-            )
-        return doc_iterator
-class ConfigurableTask(Task):
-    # Default version label; __init__ replaces it with config.metadata["version"] when that key is present.
-    VERSION = "Yaml"
-    # Set in __init__ from config.output_type when the config provides one.
-    OUTPUT_TYPE = None
-    # Optional pre-built config; __init__ starts from it when it is not None.
-    CONFIG = None
-    def __init__(
-        self,
-        data_dir=None,
-        cache_dir=None,
-        download_mode=None,
-        config: Optional[dict] = None,
-    ) -> None:  # TODO no super() call here
-        # Get pre-configured attributes
-        self._config = self.CONFIG
-        # Use new configurations if there was no preconfiguration
-        if self.config is None:
-            self._config = TaskConfig(**config)
-        # Overwrite configs
-        else:
-            if config is not None:
-                self._config.__dict__.update(config)
-        if self.config is None:
-            raise ValueError(
-                "Must pass a config to ConfigurableTask, either in cls.CONFIG or `config` kwarg"
-            )
-        if isinstance(self.config.metadata, dict):
-            if "version" in self.config.metadata:
-                self.VERSION = self.config.metadata["version"]
-        if self.config.output_type is not None:
-            if self.config.output_type not in ALL_OUTPUT_TYPES:
-                raise ValueError(
-                    f"Got invalid output_type '{self.config.output_type}', must be in '{','.join(ALL_OUTPUT_TYPES)}'"
-                )
-            self.OUTPUT_TYPE = self.config.output_type
-        if self.config.doc_to_image is not None:
-            # mark the task as requiring multimodality.
-            self.MULTIMODAL = True
-        if self.config.doc_to_audio:
-            # mark the task as requiring multimodality.
-            self.MULTIMODAL = True
-        if self.config.unsafe_code is not False:
-            self.UNSAFE_CODE = True
-        if self.config.dataset_path is not None:
-            self.DATASET_PATH = self.config.dataset_path
-        if self.config.dataset_name is not None:
-            self.DATASET_NAME = self.config.dataset_name
-        self._metric_fn_list = {}
-        self._metric_fn_kwargs = {}
-        self._aggregation_list = {}
-        self._higher_is_better = {}
-        if self.config.metric_list is None:
-            # TODO: handle this in TaskConfig.__post_init__ ?
-            _metric_list = DEFAULT_METRIC_REGISTRY[self.config.output_type]
-            for metric_name in _metric_list:
-                self._metric_fn_list[metric_name] = get_metric(metric_name)
-                self._metric_fn_kwargs[metric_name] = {}
-                self._aggregation_list[metric_name] = get_metric_aggregation(
-                    metric_name
-                )
-                self._higher_is_better[metric_name] = is_higher_better(metric_name)
-        else:
-            for metric_config in self.config.metric_list:
-                if "metric" not in metric_config:
-                    raise ValueError(
-                        "'metric' key not provided for an entry in 'metric_list', must be specified!"
-                    )
-                metric_name = metric_config["metric"]
-                kwargs = {
-                    key: metric_config[key]
-                    for key in metric_config
-                    if key
-                    not in ["metric", "aggregation", "higher_is_better", "hf_evaluate"]
-                }
-                hf_evaluate_metric = (
-                    "hf_evaluate" in metric_config
-                    and metric_config["hf_evaluate"] is True
-                )
-                if self.config.process_results is not None:
-                    self._metric_fn_list[metric_name] = None
-                    self._metric_fn_kwargs[metric_name] = {}
-                elif callable(metric_name):
-                    metric_fn = metric_name.__call__
-                    metric_name = metric_name.__name__
-                    self._metric_fn_list[metric_name] = metric_fn
-                    self._metric_fn_kwargs[metric_name] = kwargs
-                else:
-                    self._metric_fn_list[metric_name] = get_metric(
-                        metric_name, hf_evaluate_metric
-                    )
-                    self._metric_fn_kwargs[metric_name] = kwargs
-                if "aggregation" in metric_config:
-                    agg_name = metric_config["aggregation"]
-                    if isinstance(agg_name, str):
-                        self._aggregation_list[metric_name] = get_aggregation(agg_name)
-                    elif callable(agg_name):  # noqa: E721
-                        self._aggregation_list[metric_name] = metric_config[
-                            "aggregation"
-                        ]
-                else:
-                    INV_AGG_REGISTRY = {v: k for k, v in AGGREGATION_REGISTRY.items()}
-                    metric_agg = get_metric_aggregation(metric_name)
-                    eval_logger.warning(
-                        f"[Task: {self.config.task}] metric {metric_name} is defined, but aggregation is not. "
-                        f"using default "
-                        f"aggregation={INV_AGG_REGISTRY[metric_agg]}"
-                    )
-                    self._aggregation_list[metric_name] = metric_agg
-                if "higher_is_better" in metric_config:
-                    self._higher_is_better[metric_name] = metric_config[
-                        "higher_is_better"
-                    ]
-                else:
-                    eval_logger.warning(
-                        f"[Task: {self.config.task}] metric {metric_name} is defined, but higher_is_better is not. "
-                        f"using default "
-                        f"higher_is_better={is_higher_better(metric_name)}"
-                    )
-                    self._higher_is_better[metric_name] = is_higher_better(metric_name)
-        self.download(self.config.dataset_kwargs)
-        self._training_docs = None
-        self._fewshot_docs = None
-        if self.config.filter_list is not None:
-            self._filters = []
-            for filter_config in self.config.filter_list:
-                filter_name = filter_config["name"]
-                filter_functions = filter_config["filter"]
-                components = []
-                for function in filter_functions:
-                    kwargs = {
-                        key: function[key] for key in function if key != "function"
-                    }
-                    components.append([function["function"], kwargs])
-                filter_pipeline = build_filter_ensemble(filter_name, components)
-                self._filters.append(filter_pipeline)
-        else:
-            # TODO: handle repeats in a more general way rather than just discarding
-            eval_logger.debug(
-                "No custom filters defined. Using default 'take_first' filter for handling repeats."
-            )
-            self._filters = [build_filter_ensemble("none", [["take_first", None]])]
-        if self.config.use_prompt is not None:
-            eval_logger.info(f"loading prompt {self.config.use_prompt}")
-            self.prompt = get_prompt(
-                self.config.use_prompt, self.DATASET_PATH, self.DATASET_NAME
-            )
-        else:
-            self.prompt = None
-        if self.fewshot_docs() is not None:
-            self.fewshot_rnd = (
-                random.Random()
-            )  # setting with no seed, to be overridden at a later time
-            config_sampler: Union[str, Callable] = (
-                self.config.fewshot_config.get("sampler", "default")
-                if self.config.fewshot_config
-                else "default"
-            )
-            if isinstance(config_sampler, str):
-                self.sampler = samplers.get_sampler(config_sampler)(
-                    list(self.fewshot_docs()), self, rnd=self.fewshot_rnd
-                )
-            elif callable(config_sampler) and issubclass(
-                config_sampler, samplers.ContextSampler
-            ):
-                self.sampler = config_sampler(
-                    docs=list(self.fewshot_docs()), task=self, rnd=self.fewshot_rnd
-                )
-            else:
-                raise TypeError(
-                    f"fewshot_config.sampler should be a string or callable of ContextSampler type, "
-                    f"not {type(config_sampler)}"
-                )
-        self.task_docs = self.eval_docs
-        # Test One Doc
-        self.features = list(self.task_docs.features.keys())
-        self.multiple_input = 0
-        self.multiple_target = 0
-        test_doc = self.task_docs[0]
-        test_text = self.doc_to_text(test_doc)
-        test_target = self.doc_to_target(test_doc)
-        if self.config.doc_to_choice is not None:
-            test_choice = self.doc_to_choice(test_doc)
-            if not isinstance(test_choice, list):
-                eval_logger.error("doc_to_choice must return list")
-            else:
-                num_choice = len(test_choice)
-            if isinstance(test_text, int):
-                eval_logger.debug(
-                    "doc_to_text returned an int. Assuming multiple inputs."
-                )
-                self.multiple_input = num_choice
-        else:
-            test_choice = None
-        if isinstance(test_target, list):
-            eval_logger.debug(
-                "doc_to_target returned a list. Assuming multiple targets."
-            )
-            self.multiple_target = len(test_target)
-        else:
-            if (isinstance(test_target, int)) and (test_choice is not None):
-                test_target = test_choice[test_target]
-            else:
-                test_target = str(test_target)
-        if test_choice is not None:
-            check_choices = test_choice
-        else:
-            check_choices = [test_target]
-        if self.config.doc_to_choice is not None:
-            for choice in check_choices:
-                choice_has_whitespace = True if choice[0].isspace() else False
-                delimiter_has_whitespace = (
-                    True
-                    if self.config.target_delimiter.rstrip()
-                    != self.config.target_delimiter
-                    else False
-                )
-                if delimiter_has_whitespace and choice_has_whitespace:
-                    eval_logger.debug(
-                        f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" have whitespace'
-                    )
-                elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
-                    eval_logger.debug(
-                        f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" do not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
-                    )
-    def download(
-        self, dataset_kwargs: Optional[Dict[str, Any]] = None, **kwargs
-    ) -> None:
-        if isinstance(self.config.custom_dataset, Callable):
-            eval_logger.warning(
-                f"{self.config.task}: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager."
-                + "\nFor example --metadata='{\"max_seq_lengths\":[4096, 8192]}'. For details see task Readme."
-            )
-            self.dataset = self.config.custom_dataset(
-                **(self.config.metadata or {}), **(self.config.dataset_kwargs or {})
-            )
-        else:
-            self.dataset = datasets.load_dataset(
-                path=self.DATASET_PATH,
-                name=self.DATASET_NAME,
-                **dataset_kwargs if dataset_kwargs is not None else {},
-            )
-    def has_training_docs(self) -> bool:
-        if self.config.training_split is not None:
-            return True
-        else:
-            return False
-    def has_validation_docs(self) -> bool:
-        if self.config.validation_split is not None:
-            return True
-        else:
-            return False
-    def has_test_docs(self) -> bool:
-        if self.config.test_split is not None:
-            return True
-        else:
-            return False
-    def training_docs(self) -> datasets.Dataset:
-        if self.has_training_docs():
-            if self.config.process_docs is not None:
-                return self.config.process_docs(
-                    self.dataset[self.config.training_split]
-                )
-            return self.dataset[self.config.training_split]
-    def validation_docs(self) -> datasets.Dataset:
-        if self.has_validation_docs():
-            if self.config.process_docs is not None:
-                return self.config.process_docs(
-                    self.dataset[self.config.validation_split]
-                )
-            return self.dataset[self.config.validation_split]
-    def test_docs(self) -> datasets.Dataset:
-        if self.has_test_docs():
-            if self.config.process_docs is not None:
-                return self.config.process_docs(self.dataset[self.config.test_split])
-            return self.dataset[self.config.test_split]
-    def fewshot_docs(self):
-        if self.config.fewshot_split is not None:
-            if self.config.process_docs is not None:
-                return self.config.process_docs(self.dataset[self.config.fewshot_split])
-            return self.dataset[self.config.fewshot_split]
-        elif (
-            self.config.fewshot_config is not None
-            and self.config.fewshot_config.get("samples", None) is not None
-        ):
-            if isinstance(self.config.fewshot_config["samples"], list):
-                return self.config.fewshot_config["samples"]
-            elif callable(self.config.fewshot_config["samples"]):
-                return self.config.fewshot_config["samples"]()
-            else:
-                raise Exception(
-                    "`fewshot_config['samples']` was incorrectly defined in the configuration. It should be either a list of samples as a dict, or function returning this list."
-                )
-        else:
-            if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0):
-                eval_logger.warning(
-                    f"[Task: {self.config.task}] "
-                    "num_fewshot > 0 but fewshot_split is None. "
-                    "using preconfigured rule."
-                )
-            return super().fewshot_docs()
-    @staticmethod
-    def append_target_question(
-        labeled_examples: List[Dict[str, str]],
-        question: str,
-        fewshot_as_multiturn: bool = False,
-        gen_prefix: Optional[str] = None,
-    ) -> None:
-        """Adds a target question to the labeled examples list.
-        If fewshot_as_multiturn is True, or labeled_examples is empty, or the last entry is a system turn, appends the question as a new user entry.
-        Otherwise, it is appended to the last user entry, ensuring that the conversation alternates between the user and the assistant.
-        """
-        if not fewshot_as_multiturn:
-            # if no messages or last message is system, append as new user entry
-            if len(labeled_examples) == 0 or labeled_examples[-1]["role"] == "system":
-                labeled_examples.append({"role": "user", "content": question})
-            # if last message is user, append to it to avoid two user messages in a row
-            else:
-                labeled_examples[-1]["content"] += question
-        else:
-            # if fewshot_as_multiturn is True, append as next user entry (last is always assistant)
-            labeled_examples.append({"role": "user", "content": question})
-        if gen_prefix:
-            labeled_examples.append({"role": "assistant", "content": gen_prefix})
-    @utils.positional_deprecated
-    def fewshot_context(
-        self,
-        doc: dict,
-        num_fewshot: int,
-        system_instruction: Optional[str] = None,
-        apply_chat_template: bool = False,
-        fewshot_as_multiturn: bool = False,
-        chat_template: Optional[Callable] = None,
-        gen_prefix: Optional[str] = None,
-    ) -> Union[str, List[str]]:
-        """Returns a fewshot context string that is made up of a prepended description
-        (if provided), the `num_fewshot` number of examples, and an appended prompt example.
-        :param doc: str
-            The document as returned from training_docs, validation_docs, or test_docs.
-        :param num_fewshot: int
-            The number of fewshot examples to provide in the returned context string.
-        :param  system_instruction: str
-            System instruction to be applied to the prompt.
-        :param apply_chat_template: bool
-            Whether to apply the chat template to the fewshot context.
-        :param fewshot_as_multiturn: bool
-            Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
-        :param chat_template:
-            callable (from lm.apply_chat_template) that takes in a list[Dict] chat transcript and renders it into a string.
-        :param gen_prefix:
-            String to append after the <|assistant|> token.
-        :returns: str
-            The fewshot context.
-        """
-        if apply_chat_template:
-            labeled_examples = []
-        else:
-            labeled_examples = ""
-        # get task description
-        if description := self.config.description:
-            description = utils.apply_template(self.config.description, doc)
-        # create system prompt based on the provided system instruction and description
-        if system_instruction is not None and description:
-            system_prompt = (
-                f"{system_instruction}{self.sampler.fewshot_delimiter}{description}"
-            )
-        elif system_instruction is not None:
-            system_prompt = system_instruction
-        elif description:
-            system_prompt = description
-        else:
-            system_prompt = ""
-        # add system prompt if specified
-        if system_prompt:
-            if apply_chat_template:
-                labeled_examples.append({"role": "system", "content": system_prompt})
-            else:
-                labeled_examples = system_prompt
-        # if few-shot - append examples after the system prompt
-        if num_fewshot > 0:
-            if apply_chat_template:
-                labeled_examples.extend(
-                    self.sampler.get_chat_context(
-                        doc,
-                        num_fewshot,
-                        fewshot_as_multiturn,
-                        gen_prefix=gen_prefix,
-                    )
-                )
-            else:
-                labeled_examples += self.sampler.get_context(
-                    doc, num_fewshot, gen_prefix=gen_prefix
-                )
-        example = self.doc_to_text(doc)
-        if apply_chat_template:
-            if self.multiple_input:
-                # TODO: append prefill?
-                if not labeled_examples:
-                    return ""
-                return chat_template(labeled_examples)
-            if isinstance(example, str):
-                self.append_target_question(
-                    labeled_examples,
-                    example,
-                    fewshot_as_multiturn,
-                    gen_prefix=gen_prefix,
-                )
-            # for loglikelihood create a list of questions with appended choices
-            elif isinstance(example, list):
-                labeled_examples_list = []
-                # copy chat history for each example and append the answer
-                for ex in example:
-                    chat = deepcopy(labeled_examples)
-                    self.append_target_question(
-                        chat,
-                        ex,
-                        fewshot_as_multiturn,
-                        gen_prefix=gen_prefix,
-                    )
-                    # TODO: append prefill?
-                    labeled_examples_list.append(
-                        chat_template(
-                            chat,
-                            add_generation_prompt=False if gen_prefix else True,
-                        )
-                    )
-                return labeled_examples_list
-            # if example is an integer, append the choice or convert to string
-            elif isinstance(example, int):
-                if self.config.doc_to_choice is not None:
-                    choices = self.doc_to_choice(doc)
-                    self.append_target_question(
-                        labeled_examples,
-                        choices[example],
-                        fewshot_as_multiturn,
-                        gen_prefix=gen_prefix,
-                    )
-                else:
-                    self.append_target_question(
-                        labeled_examples,
-                        str(example),
-                        fewshot_as_multiturn,
-                        gen_prefix=gen_prefix,
-                    )
-                # return lm.apply_chat_template(labeled_examples)
-            return chat_template(
-                labeled_examples,
-                add_generation_prompt=False if gen_prefix else True,
-            )
-        else:
-            prefix = (
-                self.config.target_delimiter + gen_prefix
-                if gen_prefix is not None
-                else ""
-            )
-            if self.multiple_input:
-                return labeled_examples
-            if isinstance(example, str):
-                return labeled_examples + example + prefix
-            elif isinstance(example, list):
-                return [labeled_examples + ex + prefix for ex in example]
-            elif isinstance(example, int):
-                if self.config.doc_to_choice is not None:
-                    choices = self.doc_to_choice(doc)
-                    return labeled_examples + choices[example] + prefix
-                else:
-                    return labeled_examples + str(example) + prefix
-    def apply_filters(self) -> Optional[List[Instance]]:
-        """Iterates over FilterEnsembles and applies them to instances"""
-        if hasattr(self, "_filters"):
-            for f in self._filters:
-                f.apply(self._instances)
-        else:
-            eval_logger.warning("No filter defined, passing through instances")
-            return self._instances
-    def should_decontaminate(self):
-        return self.config.should_decontaminate
-    def doc_to_decontamination_query(self, doc: dict):
-        if self.config.should_decontaminate:
-            if self.config.doc_to_decontamination_query is None:
-                return self.doc_to_text(doc)
-            else:
-                doc_to_decontamination_query = self.config.doc_to_decontamination_query
-                if doc_to_decontamination_query in self.features:
-                    return doc[doc_to_decontamination_query]
-                elif callable(doc_to_decontamination_query):
-                    return doc_to_decontamination_query(doc)
-                else:
-                    return ast.literal_eval(
-                        utils.apply_template(
-                            self.config.doc_to_decontamination_query, doc
-                        )
-                    )
-    def _process_doc(self, doc: dict) -> dict:
-        """
-        Override this to process (detokenize, strip, replace, etc.) individual
-        documents. This can be used in a map over documents of a data split.
-        E.g. `map(self._process_doc, self.dataset["validation"])`
-        :return: dict
-            The processed version of the specified `doc`.
-        """
-        return doc
-    def doc_to_text(self, doc, doc_to_text=None):
-        if self.prompt is not None:
-            doc_to_text = self.prompt
-        elif doc_to_text is not None:
-            doc_to_text = doc_to_text
-        else:
-            doc_to_text = self.config.doc_to_text
-        if isinstance(doc_to_text, int):
-            return doc_to_text
-        elif isinstance(doc_to_text, str):
-            if doc_to_text in self.features:
-                # if self.config.doc_to_choice is not None:
-                #     return self.doc_to_choice(doc)[doc[doc_to_text]]
-                # else:
-                return doc[doc_to_text]
-            else:
-                text_string = utils.apply_template(doc_to_text, doc)
-                if text_string.isdigit() and self._config.doc_to_choice is not None:
-                    return ast.literal_eval(text_string)
-                else:
-                    return text_string
-        elif callable(doc_to_text):
-            return doc_to_text(doc)
-        # Used when applying a Promptsource template
-        elif hasattr(doc_to_text, "apply"):
-            applied_prompt = doc_to_text.apply(doc)
-            if len(applied_prompt) == 2:
-                return applied_prompt[0]
-            else:
-                eval_logger.warning("Applied prompt returns empty string")
-                return self.config.fewshot_delimiter
-        else:
-            print(type(doc_to_text))
-            raise TypeError
-    def doc_to_target(self, doc: Mapping, doc_to_target=None) -> Union[int, str, list]:
-        if self.prompt is not None:
-            doc_to_target = self.prompt
-        elif doc_to_target is not None:
-            doc_to_target = doc_to_target
-        else:
-            doc_to_target = self.config.doc_to_target
-        if isinstance(doc_to_target, int):
-            return doc_to_target
-        elif isinstance(doc_to_target, str):
-            if doc_to_target in self.features:
-                # if self.config.doc_to_choice is not None:
-                #     return self.doc_to_choice(doc)[doc[doc_to_target]]
-                # else:
-                return doc[doc_to_target]
-            else:
-                target_string = utils.apply_template(doc_to_target, doc)
-                if target_string.isdigit() and self._config.doc_to_choice is not None:
-                    return ast.literal_eval(target_string)
-                elif (
-                    len(target_string) >= 2
-                    and (target_string[0] == "[")
-                    and (target_string[-1] == "]")
-                ):
-                    try:
-                        return ast.literal_eval(target_string)
-                    except (SyntaxError, ValueError):
-                        return target_string
-                else:
-                    return target_string
-        elif isinstance(doc_to_target, list):
-            return doc_to_target
-        elif callable(doc_to_target):
-            return doc_to_target(doc)
-        # Used when applying a Promptsource template
-        elif hasattr(doc_to_target, "apply"):
-            applied_prompt = doc_to_target.apply(doc)
-            if len(applied_prompt) == 2:
-                return applied_prompt[1]
-            else:
-                eval_logger.warning("Applied prompt returns empty string")
-                return self.config.fewshot_delimiter
-        else:
-            raise TypeError
-    def doc_to_choice(self, doc: Any, doc_to_choice=None) -> List[str]:
-        if self.prompt is not None:
-            doc_to_choice = self.prompt
-        elif doc_to_choice is not None:
-            doc_to_choice = doc_to_choice
-        elif self.config.doc_to_choice is None:
-            eval_logger.error("doc_to_choice was called but not set in config")
-        else:
-            doc_to_choice = self.config.doc_to_choice
-        if isinstance(doc_to_choice, str):
-            if doc_to_choice in self.features:
-                return doc[doc_to_choice]
-            else:
-                return ast.literal_eval(utils.apply_template(doc_to_choice, doc))
-        elif isinstance(doc_to_choice, list):
-            return doc_to_choice
-        elif isinstance(doc_to_choice, dict):
-            return list(doc_to_choice.values())
-        elif callable(doc_to_choice):
-            return doc_to_choice(doc)
-        elif hasattr(doc_to_choice, "get_answer_choices_list"):
-            return doc_to_choice.get_answer_choices_list(doc)
-        else:
-            raise TypeError
-    def doc_to_image(self, doc: Any, doc_to_image=None) -> Union[int, str, list]:
-        if doc_to_image is not None:
-            doc_to_image = doc_to_image
-        elif self.config.doc_to_image is not None:
-            doc_to_image = self.config.doc_to_image
-        else:
-            return None
-        if isinstance(doc_to_image, list):
-            image_feature = [
-                self.doc_to_image(doc, feature) for feature in doc_to_image
-            ]
-            return [feature for feature in image_feature if feature is not None]
-        elif isinstance(doc_to_image, str):
-            if doc_to_image in self.features:
-                return doc[doc_to_image]
-            else:
-                return ast.literal_eval(utils.apply_template(doc_to_image, doc))
-        elif callable(doc_to_image):
-            return doc_to_image(doc)
-        else:
-            return None
-    def doc_to_audio(self, doc: Any, doc_to_audio=None) -> Union[int, str, list]:
-        if doc_to_audio is not None:
-            doc_to_audio = doc_to_audio
-        elif self.config.doc_to_audio is not None:
-            doc_to_audio = self.config.doc_to_audio
-        else:
-            return None
-        if isinstance(doc_to_audio, list):
-            audio_feature = [
-                self.doc_to_audio(doc, feature) for feature in doc_to_audio
-            ]
-            return [feature for feature in audio_feature if feature is not None]
-        elif isinstance(doc_to_audio, str):
-            if doc_to_audio in self.features:
-                return doc[doc_to_audio]
-            else:
-                return ast.literal_eval(utils.apply_template(doc_to_audio, doc))
-        elif callable(doc_to_audio):
-            return doc_to_audio(doc)
-        else:
-            return None
-    def doc_to_prefix(self, doc):
-        if (gen_prefix := self.config.gen_prefix) is not None:
-            if gen_prefix in self.features:
-                return doc[gen_prefix]
-            else:
-                return utils.apply_template(gen_prefix, doc)
-        return None
-    def construct_requests(
-        self, doc: dict, ctx: str, **kwargs
-    ) -> Union[List[Instance], Instance]:
-        apply_chat_template = kwargs.pop("apply_chat_template", False)
-        chat_template: Callable | None = kwargs.pop("chat_template", None)
-        aux_arguments = None
-        if self.OUTPUT_TYPE == "loglikelihood":
-            arguments = (ctx, self.doc_to_target(doc))
-        elif self.OUTPUT_TYPE == "loglikelihood_rolling":
-            arguments = (self.doc_to_target(doc),)
-        elif self.OUTPUT_TYPE == "multiple_choice":
-            choices = self.doc_to_choice(doc)
-            target_delimiter = self.config.target_delimiter
-            if apply_chat_template:
-                target_delimiter = ""
-            if self.multiple_input:
-                # If there are multiple inputs, choices are placed in the ctx
-                # apply chat_template to choices if apply_chat_template
-                cont = self.doc_to_target(doc)
-                arguments = [
-                    (
-                        ctx
-                        + (
-                            chat_template([{"role": "user", "content": choice}])
-                            if apply_chat_template
-                            else choice
-                        ),
-                        f"{target_delimiter}{cont}",
-                    )
-                    for choice in choices
-                ]
-            else:
-                # Otherwise they are placed in the continuation
-                arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]
-            # TODO: we should raise a warning telling users this will at most ~2x runtime.
-            if "acc_mutual_info" in self._metric_fn_list.keys():
-                # if we are calculating multiple choice accuracy
-                # using mutual information instead of raw loglikelihood as metric, need unconditional lls.
-                # here mutual info refers to calculating
-                # log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice))
-                # in other words normalizing by subtracting the unconditional logprob of each choice.
-                # TODO: should these be strided? will have to modify the processing in process_results if so
-                aux_arguments = [
-                    ("", f"{target_delimiter}{choice}") for choice in choices
-                ]
-                arguments.extend(aux_arguments)
-        elif self.OUTPUT_TYPE == "generate_until":
-            arguments = (ctx, deepcopy(self.config.generation_kwargs))
-        multimodal_arg = {}
-        if (
-            self.config.doc_to_image
-        ):  # TODO: ensure that non-multimodal tasks aren't getting visual args
-            multimodal_arg = {
-                **multimodal_arg,
-                **{"visual": self.doc_to_image(doc)},
-            }
-        if (
-            self.config.doc_to_audio
-        ):  # TODO: ensure that non-multimodal tasks aren't getting audio args
-            multimodal_arg = {
-                **multimodal_arg,
-                **{"audio": self.doc_to_audio(doc)},
-            }
-        if bool(multimodal_arg):
-            if isinstance(arguments, list):
-                arguments = [arg + (multimodal_arg,) for arg in arguments]
-            else:
-                arguments = arguments + (multimodal_arg,)
-        if self.OUTPUT_TYPE == "multiple_choice":
-            request_list = [
-                Instance(
-                    request_type="loglikelihood",
-                    doc=doc,
-                    arguments=arg,
-                    idx=i,
-                    **kwargs,
-                )
-                for i, arg in enumerate(arguments)
-            ]
-            return request_list
-        return Instance(
-            request_type=self.OUTPUT_TYPE,
-            doc=doc,
-            arguments=arguments,
-            idx=0,
-            **kwargs,
-        )
-    def process_results(self, doc, results):
-        if callable(self.config.process_results):
-            return self.config.process_results(doc, results)
-        result_dict = {}
-        use_metric = list(self._metric_fn_list.keys())
-        if self.OUTPUT_TYPE == "loglikelihood":
-            results = results[0]
-            ll, is_greedy = results
-            return {
-                **({"perplexity": ll} if "perplexity" in use_metric else {}),
-                **({"acc": int(is_greedy)} if "acc" in use_metric else {}),
-            }
-        elif self.OUTPUT_TYPE == "loglikelihood_rolling":
-            (loglikelihood,) = results
-            _words = self.count_words(self.doc_to_target(doc))
-            _bytes = self.count_bytes(self.doc_to_target(doc))
-            return {
-                **(
-                    {"word_perplexity": (loglikelihood, _words)}
-                    if "word_perplexity" in use_metric
-                    else {}
-                ),
-                **(
-                    {"byte_perplexity": (loglikelihood, _bytes)}
-                    if "byte_perplexity" in use_metric
-                    else {}
-                ),
-                **(
-                    {"bits_per_byte": (loglikelihood, _bytes)}
-                    if "bits_per_byte" in use_metric
-                    else {}
-                ),
-            }
-        elif self.OUTPUT_TYPE == "multiple_choice":
-            lls, is_greedy = zip(*results)
-            # retrieve choices in List[str] form, to compute choice lengths, etc.
-            choices = self.doc_to_choice(doc)
-            completion_len = np.array([float(len(i)) for i in choices])
-            if (
-                2 * len(choices) == len(lls)
-                and "acc_mutual_info" in self._metric_fn_list.keys()
-            ):
-                # then we are doing mutual info.
-                # this stores the "dryrun" / unconditional answer loglikelihoods
-                # as we extend the args list with unconditional ("", continuation) pairs
-                lls_unconditional = lls[len(choices) :]
-                if len(lls_unconditional) != len(choices):
-                    raise ValueError
-                # and this stores our "regular" conditional loglikelihoods
-                lls = lls[: len(choices)]
-            pred = np.argmax(lls)
-            pred_norm = np.argmax(lls / completion_len)
-            if self.multiple_input:
-                gold = self.doc_to_text(doc)
-            else:
-                gold = self.doc_to_target(doc)
-            gold_index_error = False
-            if isinstance(gold, list):
-                gold = [i if i < len(choices) else -100 for i in gold]
-                if -100 in gold:
-                    gold_index_error = True
-            else:
-                if isinstance(gold, int):
-                    gold = gold if gold < len(choices) else -100
-                elif isinstance(gold, str):
-                    gold = choices.index(gold) if gold in choices else -100
-                if gold == -100:
-                    gold_index_error = True
-            if gold_index_error:
-                eval_logger.warning(
-                    f"Label index was not in within range of available choices,"
-                    f"Sample:\n\n{doc}\n\n"
-                )
-            if self.multiple_target:
-                acc = 1.0 if pred in gold else 0.0
-                acc_norm = 1.0 if pred_norm in gold else 0.0
-                exact_match = int(any([is_greedy[i] if i != -100 else 0 for i in gold]))
-            else:
-                acc = 1.0 if pred == gold else 0.0
-                acc_norm = 1.0 if pred_norm == gold else 0.0
-                # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
-                exact_match = int(is_greedy[gold]) if gold != -100 else 0
-            prob_norm = utils.softmax(lls)
-            # TODO use keyword arguments to the metric?
-            # gold, pred, norm stuff, the original lls,
-            result_dict = {
-                **({"acc": acc} if "acc" in use_metric else {}),
-                **({"f1": (gold, pred)} if "f1" in use_metric else {}),
-                **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
-                **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
-                **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
-                **(
-                    {"brier_score": (gold, prob_norm)}
-                    if "brier_score" in use_metric
-                    else {}
-                ),
-            }
-            if "acc_mutual_info" in use_metric:
-                lls_mutual_info = [
-                    ll_c - ll_u for ll_c, ll_u in zip(lls, lls_unconditional)
-                ]
-                acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0
-                result_dict["acc_mutual_info"] = acc_mutual_info
-        elif self.OUTPUT_TYPE == "generate_until":
-            gold = self.doc_to_target(doc)
-            result = results[0]
-            if self.config.doc_to_choice is not None:
-                # If you set doc_to_choice,
-                # it assumes that doc_to_target returns a number.
-                choices = self.doc_to_choice(doc)
-                gold = choices[gold]
-            # we expect multiple_targets to be a list.
-            elif self.multiple_target:
-                gold = list(gold)
-            # TODO: handle this better
-            elif type(gold) is not type(result) and not (
-                "bypass" in self._metric_fn_list.keys() or isinstance(result, list)
-            ):
-                # cast gold to the same type as result
-                gold = type(result)(gold)
-            for metric in self._metric_fn_list.keys():
-                if self.multiple_target:
-                    # in the case where we have multiple targets,
-                    # return true if any are true
-                    # TODO: this may break for multipLe_target, non zero-or-1 metrics
-                    scores = []
-                    if not isinstance(gold, list):
-                        # sometimes, a multiple_target dataset has exceptions where one doc has only one string answer
-                        # print(gold)
-                        gold = [gold]
-                    if metric == "exact_match":
-                        result = [result for _ in range(len(gold))]
-                        scores = self._metric_fn_list[metric](
-                            references=gold,
-                            predictions=result,
-                            **self._metric_fn_kwargs[metric],
-                        )[metric]
-                        result_score = 1.0 if scores > 0.0 else 0.0
-                    else:
-                        for gold_option in gold:
-                            try:
-                                result_score = self._metric_fn_list[metric](
-                                    references=[gold_option],
-                                    predictions=[result],
-                                    **self._metric_fn_kwargs[metric],
-                                )
-                            except (
-                                TypeError
-                            ):  # TODO: this is hacky and I don't want to do it
-                                result_score = self._metric_fn_list[metric](
-                                    [gold_option, result]
-                                )
-                            if isinstance(result_score, dict):
-                                # TODO: this handles the case where HF evaluate returns a dict.
-                                result_score = result_score[metric]
-                            scores.append(result_score)
-                        if any(scores):
-                            result_score = 1.0
-                        else:
-                            result_score = 0.0
-                else:
-                    try:
-                        result_score = self._metric_fn_list[metric](
-                            references=[gold],
-                            predictions=[result],
-                            **self._metric_fn_kwargs[metric],
-                        )
-                    except TypeError:  # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
-                        result_score = self._metric_fn_list[metric]([gold, result])
-                if isinstance(result_score, dict):
-                    # TODO: this handles the case where HF evaluate returns a dict.
-                    # This allows for multiple metrics to be returned from the same function
-                    for k, v in result_score.items():
-                        result_dict[k] = v
-                else:
-                    result_dict[metric] = result_score
-        else:
-            raise ValueError(
-                f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
-                "'loglikelihood', 'loglikelihood_rolling', 'generate_until' or 'multiple_choice'",
-            )
-        return result_dict
-    def aggregation(self) -> dict:
-        return self._aggregation_list
-    def higher_is_better(self) -> dict:
-        return self._higher_is_better
-    def get_config(self, key: str) -> Any:
-        return getattr(self._config, key, None)
-    @property
-    def task_name(self) -> Any:
-        return getattr(self.config, "task", None)
-    def __repr__(self):
-        return (
-            f"ConfigurableTask(task_name={getattr(self.config, 'task', None)},"
-            f"output_type={self.OUTPUT_TYPE},"
-            f"num_fewshot={getattr(self.config, 'num_fewshot', None)},"
-            f"num_samples={len(self.eval_docs)})"
-        )
-class MultipleChoiceTask(Task):
-    OUTPUT_TYPE = "loglikelihood"
-    def doc_to_target(self, doc: dict) -> str:
-        return " " + doc["choices"][doc["gold"]]
-    def construct_requests(self, doc: dict, ctx: str, **kwargs) -> List[Instance]:
-        # TODO: add mutual info here?
-        return [
-            Instance(
-                request_type="loglikelihood",
-                doc=doc,
-                arguments=(ctx, " {}".format(choice)),
-                idx=i,
-                **kwargs,
-            )
-            for i, choice in enumerate(doc["choices"])
-        ]
-    def process_results(self, doc: dict, results: Iterable[Tuple[float, bool]]) -> dict:
-        results = [
-            res[0] for res in results
-        ]  # only retain loglikelihoods, discard is_greedy TODO: do we need is_greedy anywhere?
-        gold = doc["gold"]
-        acc = 1.0 if np.argmax(results) == gold else 0.0
-        completion_len = np.array([float(len(i)) for i in doc["choices"]])
-        acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0
-        return {
-            "acc": acc,
-            "acc_norm": acc_norm,
-        }
-    def higher_is_better(self) -> dict:
-        return {
-            "acc": True,
-            "acc_norm": True,
-        }
-    def aggregation(self) -> dict:
-        return {
-            "acc": mean,
-            "acc_norm": mean,
-        }
-class PerplexityTask(Task):
-    OUTPUT_TYPE = "loglikelihood_rolling"
-    def has_training_docs(self) -> bool:
-        return False
-    def fewshot_examples(self, k: int, rnd) -> List:
-        if k != 0:
-            raise ValueError(
-                "The number of fewshot examples must be 0 for perplexity tasks."
-            )
-        return []
-    def fewshot_context(self, doc: dict, num_fewshot: int) -> Literal[""]:
-        if num_fewshot != 0:
-            raise ValueError(
-                "The number of fewshot examples must be 0 for perplexity tasks."
-            )
-        return ""
-    def higher_is_better(self) -> dict:
-        return {
-            "word_perplexity": False,
-            "byte_perplexity": False,
-            "bits_per_byte": False,
-        }
-    def doc_to_decontamination_query(self, doc):
-        return doc
-    def doc_to_text(self, doc) -> str:
-        return ""
-    def doc_to_target(self, doc):
-        return doc
-    def construct_requests(self, doc: dict, ctx: Optional[str], **kwargs):
-        if bool(ctx):
-            raise ValueError
-        return Instance(
-            request_type=self.OUTPUT_TYPE,
-            doc=doc,
-            arguments=(self.doc_to_target(doc),),
-            idx=0,
-            **kwargs,
-        )
-    def process_results(self, doc: dict, results: Tuple[float]) -> dict:
-        (loglikelihood,) = results
-        words = self.count_words(self.doc_to_target(doc))
-        bytes_ = self.count_bytes(self.doc_to_target(doc))
-        return {
-            "word_perplexity": (loglikelihood, words),
-            "byte_perplexity": (loglikelihood, bytes_),
-            "bits_per_byte": (loglikelihood, bytes_),
-        }
-    def aggregation(self) -> dict:
-        return {
-            "word_perplexity": weighted_perplexity,
-            "byte_perplexity": weighted_perplexity,
-            "bits_per_byte": bits_per_byte,
-        }
-    @classmethod
-    def count_bytes(cls, doc) -> int:
-        return len(doc.encode("utf-8"))
-    @classmethod
-    def count_words(cls, doc) -> int:
-        """Downstream tasks with custom word boundaries should override this!"""
-        return len(re.split(r"\s+", doc))

lm-evaluation-harness/lm_eval/caching/cache.py DELETED Viewed

@@ -1,59 +0,0 @@
-import hashlib
-import logging
-import os
-import dill
-# Module-level logger named after this module.
-eval_logger = logging.getLogger(__name__)
-# Directory containing this source file (realpath resolves symlinks).
-MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
-# Optional cache-directory override, read from the environment once at import time.
-OVERRIDE_PATH = os.getenv("LM_HARNESS_CACHE_PATH")
-# Cache directory: the override when it is set and non-empty, else ".cache" under MODULE_DIR.
-PATH = OVERRIDE_PATH if OVERRIDE_PATH else f"{MODULE_DIR}/.cache"
-# This should be sufficient for uniqueness
-HASH_INPUT = "EleutherAI-lm-evaluation-harness"
-# SHA-256 hex digest of HASH_INPUT.
-HASH_PREFIX = hashlib.sha256(HASH_INPUT.encode("utf-8")).hexdigest()
-# File-name suffix built from the digest above.
-FILE_SUFFIX = f".{HASH_PREFIX}.pickle"
-def load_from_cache(file_name: str, cache: bool = False):
-    """Return the object cached under ``file_name``, or ``None``.
-    Args:
-        file_name: cache key; the file read is ``{PATH}/{file_name}{FILE_SUFFIX}``.
-        cache: when falsy, nothing is read and ``None`` is returned.
-    Returns:
-        The deserialized object, or ``None`` when caching is off, the file is
-        missing, or it cannot be read or deserialized.
-    """
-    if not cache:
-        return None
-    path = f"{PATH}/{file_name}{FILE_SUFFIX}"
-    try:
-        with open(path, "rb") as file:
-            # NOTE(review): dill deserialization can run arbitrary code; only
-            # point the cache directory at files this process wrote itself.
-            return dill.loads(file.read())
-    except FileNotFoundError:
-        # The ordinary cache miss.
-        eval_logger.debug(f"{file_name} is not cached, generating...")
-    except Exception as err:
-        # Still best-effort (the caller regenerates), but a broken cache file
-        # is no longer reported as a plain miss.
-        eval_logger.warning(
-            f"Could not load cache file {path} ({err!r}), generating..."
-        )
-    return None
-def save_to_cache(file_name, obj):
-    """Serialize ``obj`` with dill into ``{PATH}/{file_name}{FILE_SUFFIX}``.
-    The cache directory is created first, including missing parents, and an
-    already existing directory is not an error.
-    """
-    # makedirs(exist_ok=True) replaces the exists()+mkdir() pair: no race with a
-    # concurrent creator, and a nested override path works.
-    os.makedirs(PATH, exist_ok=True)
-    file_path = f"{PATH}/{file_name}{FILE_SUFFIX}"
-    eval_logger.debug(f"Saving {file_path} to cache...")
-    with open(file_path, "wb") as file:
-        file.write(dill.dumps(obj))
-# NOTE the "key" param is to allow for flexibility
-def delete_cache(key: str = ""):
-    """Delete cache files whose name starts with ``key`` and ends with ``FILE_SUFFIX``.
-    With the default empty ``key`` every file carrying the suffix is removed.
-    A cache directory that does not exist is treated as already empty.
-    """
-    if not os.path.isdir(PATH):
-        # Nothing was ever cached; listdir would raise FileNotFoundError.
-        return
-    for file in os.listdir(PATH):
-        if file.startswith(key) and file.endswith(FILE_SUFFIX):
-            os.unlink(f"{PATH}/{file}")

lm-evaluation-harness/lm_eval/decontamination/__init__.py DELETED Viewed

File without changes

lm-evaluation-harness/lm_eval/decontamination/archiver.py DELETED Viewed

@@ -1,174 +0,0 @@
-import datetime
-import io
-import json
-import mmap
-import os
-from pathlib import Path
-from typing import Any
-import jsonlines
-import tqdm
-import zstandard
-def json_serial(obj: Any) -> str:
-    """Fallback serializer for ``json.dumps``: datetimes become ISO-8601 strings.
-    Raises:
-        TypeError: for any object that is not a ``datetime.datetime``.
-    """
-    if not isinstance(obj, datetime.datetime):
-        raise TypeError("Type %s not serializable" % (type(obj),))
-    return obj.isoformat()
-# Modified version of lm_dataformat Archive for single file.
-class Archive:
-    """Writer that appends JSON lines to one zstandard-compressed file."""
-    def __init__(self, file_path: str, compression_level: int = 3) -> None:
-        """Create the parent directory if needed and open ``file_path`` for writing."""
-        self.file_path = file_path
-        parent = os.path.dirname(file_path)
-        if parent:
-            os.makedirs(parent, exist_ok=True)
-        self.fh = open(self.file_path, "wb")
-        self.cctx = zstandard.ZstdCompressor(level=compression_level)
-        self.compressor = self.cctx.stream_writer(self.fh)
-    def add_data(self, data, meta=None) -> None:
-        """Write one ``{"text": data, "meta": meta}`` record followed by a newline."""
-        record = {"text": data, "meta": {} if meta is None else meta}
-        encoded = json.dumps(record, default=json_serial).encode("UTF-8")
-        self.compressor.write(encoded + b"\n")
-    def commit(self) -> None:
-        """Finish the compressed frame, then flush and close the file handle."""
-        self.compressor.flush(zstandard.FLUSH_FRAME)
-        self.fh.flush()
-        self.fh.close()
-# Modified version of lm_dataformat Reader with self.fh set, allowing peeking for tqdm.
-class Reader:
-    """Reader for zstandard-compressed JSON-lines files."""
-    def __init__(self) -> None:
-        pass
-    def read(
-        self,
-        file,
-        get_meta: bool = False,
-        autojoin_paragraphs: bool = True,
-        para_joiner: str = "\n\n",
-    ):
-        """Yield the text of every record in ``file``.
-        With ``get_meta`` each item is ``(text, meta)``, where ``meta`` is ``{}``
-        if the record has none. A list-valued text is joined with
-        ``para_joiner`` when ``autojoin_paragraphs`` is true.
-        """
-        with open(file, "rb") as fh:
-            # Exposed on the instance while the file is being read.
-            self.fh = fh
-            decompressed = zstandard.ZstdDecompressor().stream_reader(fh)
-            for ob in jsonlines.Reader(io.BufferedReader(decompressed)):
-                # naive jsonl where each object is just the string itself, with no meta. For legacy compatibility.
-                if isinstance(ob, str):
-                    assert not get_meta
-                    yield ob
-                    continue
-                text = ob["text"]
-                if autojoin_paragraphs and isinstance(text, list):
-                    text = para_joiner.join(text)
-                if not get_meta:
-                    yield text
-                else:
-                    yield text, ob.get("meta", {})
-class TextArchive:
-    """Writer for newline-terminated UTF-8 lines in an uncompressed file."""
-    def __init__(self, file_path, mode: str = "rb+") -> None:
-        """Ensure the directory and the file exist, then open the file with ``mode``."""
-        self.file_path = file_path
-        parent = os.path.dirname(file_path)
-        if parent:
-            os.makedirs(parent, exist_ok=True)
-        target = Path(file_path)
-        if not os.path.exists(file_path):
-            target.touch()
-        self.fh = open(self.file_path, mode)
-    def add_data(self, data) -> None:
-        """Write ``data`` encoded as UTF-8 plus a trailing newline byte."""
-        encoded = data.encode("UTF-8")
-        self.fh.write(encoded + b"\n")
-    def commit(self) -> None:
-        """Flush and close the underlying file handle."""
-        self.fh.flush()
-        self.fh.close()
-class TextReader:
-    """Line reader for a UTF-8 text file, with mmap-based and plain variants.
-    Every variant yields lines without their trailing newline. A final line
-    that is not newline-terminated is yielded in full, and an empty file
-    yields nothing.
-    """
-    def __init__(self, file_path) -> None:
-        self.file_path = file_path
-    @staticmethod
-    def _strip_newline(line: str) -> str:
-        """Drop a single trailing newline; leave an unterminated last line intact."""
-        return line[:-1] if line.endswith("\n") else line
-    def _mmap_lines(self):
-        """Yield ``(line, end_offset)`` for each line, read through a read-only mmap."""
-        # mmap raises ValueError on a zero-length file, so treat it as "no lines".
-        if os.path.getsize(self.file_path) == 0:
-            return
-        with open(self.file_path, "r", encoding="utf-8") as fh:
-            with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
-                for raw_line in iter(mmap_obj.readline, b""):
-                    line = self._strip_newline(raw_line.decode("utf-8"))
-                    yield line, mmap_obj.tell()
-    # Optimized mmap read with infrequent tqdm updates to maintain speed
-    # Tested up to 250MB/s.
-    def read_tqdm(self, update_frequency: int = 10000):
-        """Yield lines while advancing a byte-based tqdm bar every ``update_frequency`` lines."""
-        reported_position = 0
-        current_position = 0
-        line_counter = 0
-        with tqdm.tqdm(
-            total=os.path.getsize(self.file_path),
-            dynamic_ncols=True,
-            unit="byte",
-            unit_scale=1,
-        ) as progress:
-            for line, current_position in self._mmap_lines():
-                line_counter += 1
-                if line_counter == update_frequency:
-                    progress.update(current_position - reported_position)
-                    reported_position = current_position
-                    line_counter = 0
-                yield line
-            # Report the bytes read since the last periodic update so the bar
-            # reaches its total.
-            progress.update(current_position - reported_position)
-    def read_and_tell(self):
-        """Yield ``(line, raw_bytes_read)``, the byte count including the newline."""
-        previous_position = 0
-        for line, position in self._mmap_lines():
-            yield line, position - previous_position
-            previous_position = position
-    def read(self):
-        """Yield each line of the file via mmap."""
-        for line, _ in self._mmap_lines():
-            yield line
-    def read_slow(self):
-        """Yield each line using ordinary buffered text-mode reading (no mmap)."""
-        with open(self.file_path, "r", encoding="utf8") as fh:
-            for line in fh:
-                yield self._strip_newline(line)
-# Optimized for speed. Decompresses the archive in shell before
-# using the mmap'd TextReader.
-class ZStdTextReader:
-    """Decompress a ``.zst`` file with the ``zstd`` program, then read it line by line."""
-    def __init__(self, file) -> None:
-        self.file = file
-    def read_tqdm(self):
-        """Yield the lines of the decompressed file, then delete that file.
-        The decompressed path is ``self.file`` minus its last four characters,
-        so a ``.zst`` suffix is assumed. The file is removed when iteration
-        finishes, fails, or the generator is closed early.
-        """
-        import subprocess
-        decompressed_file = self.file[:-4]
-        print("Decompressing file, please wait...")
-        # Argument list instead of a shell string: paths with spaces or shell
-        # metacharacters are passed through verbatim. As before, the exit
-        # status is not checked; a missing output surfaces when it is opened.
-        subprocess.run(["zstd", "-d", "--", self.file])  # linux decompress is faster
-        try:
-            reader = TextReader(decompressed_file)
-            yield from reader.read_tqdm()
-        finally:
-            if os.path.exists(decompressed_file):
-                os.remove(decompressed_file)

lm-evaluation-harness/lm_eval/decontamination/decontaminate.py DELETED Viewed

@@ -1,166 +0,0 @@
-import collections
-import glob
-import json
-import os
-import pickle
-import random
-import time
-from .archiver import ZStdTextReader
-from .janitor import Janitor, word_ngrams
-# Was used for testing the evaluator decoupled from the full logic below
-def get_train_overlap_stub(docs: dict, ngrams_path: str, ngrams_n_size: str):
-    """Return a random sample of document indices, sized at 10% of ``docs`` (rounded down).
-    ``ngrams_path`` and ``ngrams_n_size`` are accepted but not used.
-    """
-    doc_count = len(docs)
-    sample_size = int(doc_count * 0.1)
-    return random.sample(range(doc_count), sample_size)
-# Returns a dictionary containing all overlapping documents in each
-# task. In the standard use case, an overlap occurs when any of the 13-grams
-# found in the task document exist in the training set documents.
-#
-# To generate 13-grams for the pile see scripts/clean_training_data. The final output of these
-# scripts are an info.json file containing the n_gram_size (13) and a bunch of "ngrams_{x}.bkt.txt.sorted.zst"
-# files. These should exist in the "ngrams_path" provided to this function.
-# Algorithm:
-# 1. Build lookups for each dataset {ngram: list(document_ids)}
-# 2. Merge into an overall lookup {ngram: [(task_name, task_set, doc_ids),]}
-# 3. Full scan the 13-grams from the training set against the merged lookup,
-#    saving matches in the "duplicates" dictionary {(task_name, task_set): set(doc_ids)}
-# 4. Strip the task_set from the dictionary keys and return
-#
-# We cache the task+set lookups as well as the overlaps.
-def get_train_overlap(docs_by_task_set: dict, ngrams_path: str, limit: int) -> dict:
-    """Find task documents that share an n-gram with the training-set n-gram files.
-    Args:
-        docs_by_task_set: ``{(task_name, task_set): documents}``.
-        ngrams_path: directory holding ``info.json`` (its ``ngram_size`` is read)
-            and the ``*.sorted.zst`` n-gram files.
-        limit: only used in the names of the cache files under ``data/``.
-    Returns:
-        ``{task_name: set(doc_ids)}``. Entries are keyed by task name only, so
-        when one task has several task sets the last one processed wins.
-    Lookups and overlaps are pickled under ``data/<task_name>/`` and reused on
-    later calls.
-    """
-    info_dict_path = os.path.join(ngrams_path, "info.json")
-    with open(info_dict_path, "r", encoding="utf-8") as info_file:
-        info_dict = json.load(info_file)
-    ngrams_n_size = info_dict["ngram_size"]
-    janitor = Janitor()
-    # Build lookup for each dataset first in case we use different task combinations later
-    print("Building Lookups...")
-    start = time.perf_counter()
-    def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit) -> str:
-        return f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.overlaps"
-    lookups = {}
-    duplicates = {}  # (task_name, task_set): set(doc_ids)}
-    sets_to_decontaminate = len(docs_by_task_set)
-    for (task_name, task_set), docs in docs_by_task_set.items():
-        # Also creates "data/" itself when it is missing.
-        os.makedirs(f"data/{task_name}", exist_ok=True)
-        # Check if we've decontaminated this combination before
-        overlaps_dump_path = get_overlaps_dump_path(
-            task_name, task_set, ngrams_n_size, limit
-        )
-        if os.path.exists(overlaps_dump_path):
-            # NOTE(review): pickle.load runs arbitrary code from the file; these
-            # caches must only ever be files this program wrote.
-            with open(overlaps_dump_path, "rb") as overlaps_file:
-                duplicates[(task_name, task_set)] = pickle.load(overlaps_file)
-            sets_to_decontaminate -= 1
-            continue
-        else:
-            duplicates[(task_name, task_set)] = set()
-        # Build/load the task lookup {ngram: set(documents)}.
-        task_set_lookup_path = (
-            f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.lookup"
-        )
-        if os.path.exists(task_set_lookup_path):
-            print(f"{task_set_lookup_path} available, loading...")
-            with open(task_set_lookup_path, "rb") as lookup_file:
-                lookups[(task_name, task_set)] = pickle.load(lookup_file)
-        else:
-            print(f"{task_set_lookup_path} not available, building...")
-            lookup = collections.defaultdict(set)
-            for doc_id, document in enumerate(docs):
-                ngrams = word_ngrams(janitor.normalize_string(document), ngrams_n_size)
-                for ngram in ngrams:
-                    lookup[ngram].add(doc_id)
-            with open(task_set_lookup_path, "wb") as lookup_file:
-                pickle.dump(lookup, lookup_file)
-            lookups[(task_name, task_set)] = lookup
-    elapsed = time.perf_counter() - start
-    print(f"Building lookups took {elapsed:0.5f} seconds.")
-    matched_ngrams = []
-    if sets_to_decontaminate > 0:
-        print("Merging lookups...")
-        start = time.perf_counter()
-        merged_lookup = collections.defaultdict(list)
-        for (task_name, task_set), lookup in lookups.items():
-            for ngram, doc_ids in lookup.items():
-                merged_lookup[ngram].append((task_name, task_set, doc_ids))
-        elapsed = time.perf_counter() - start
-        print(f"Merging lookups took {elapsed:0.5f} seconds.")
-        print(f"{ngrams_n_size} grams files found in {ngrams_path}:")
-        files = glob.glob(os.path.join(ngrams_path, "*.sorted.zst"))
-        print(files)
-        for file in files:
-            start = time.perf_counter()
-            print(f"Scanning {file}")
-            reader = ZStdTextReader(file)
-            total_ngrams = 0
-            unique_ngrams = 0
-            matching_unique = 0
-            non_matching_unique = 0
-            current_ngram = ""
-            for line in reader.read_tqdm():  # Scan training set ngrams file
-                total_ngrams += 1
-                [ngram, document_id] = line.rsplit(" ", 1)
-                if (
-                    ngram != current_ngram
-                ):  # Only need to match the ngram once in training set
-                    unique_ngrams += 1
-                    current_ngram = ngram
-                    if ngram in merged_lookup:
-                        matched_ngrams.append(ngram)  # For logging
-                        matching_unique += 1
-                        for task_name, task_set, doc_ids in merged_lookup[ngram]:
-                            task_doc_set = duplicates[(task_name, task_set)]
-                            for doc_id in doc_ids:  # Record contamination across all relevant task/set combos
-                                task_doc_set.add(doc_id)
-                        del merged_lookup[ngram]  # No point matching again
-                    else:
-                        non_matching_unique += 1
-            print(f"Total Ngrams: {total_ngrams}")
-            print(f"Unique Ngrams: {unique_ngrams}")
-            print(f"Unique Matching: {matching_unique}")
-            print(f"Unique Non Matching: {non_matching_unique}")
-            print("Matched ngrams:")
-            for ngram in matched_ngrams:
-                print(ngram)
-            elapsed = time.perf_counter() - start
-            print(f"Read took {elapsed:0.5f} seconds.")
-            print(f"Speed: {(os.path.getsize(file) / 1000000.0) / elapsed}MB/second")
-        print(duplicates)
-        # Dump overlaps separately
-        for (task_name, task_set), doc_ids in duplicates.items():
-            overlaps_dump_path = get_overlaps_dump_path(
-                task_name, task_set, ngrams_n_size, limit
-            )
-            with open(overlaps_dump_path, "wb") as overlaps_file:
-                pickle.dump(doc_ids, overlaps_file)
-    # Strip task set and return
-    return {task_name: doc_ids for (task_name, task_set), doc_ids in duplicates.items()}

lm-evaluation-harness/lm_eval/decontamination/janitor.py DELETED Viewed

@@ -1,328 +0,0 @@
-import pickle
-import re
-import string
-import traceback
-from typing import Iterator, List, Sequence, Tuple, TypeVar
-# This is a cpp module. Compile janitor_util.cpp with:
-# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
-# JANITOR_CPP records whether the import below succeeded. On any exception a
-# warning and the traceback are printed at import time and the flag is False.
-try:
-    import janitor_util
-    JANITOR_CPP = True
-except Exception:
-    print("WARNING: C++ module could not be loaded. Janitor running in python mode")
-    traceback.print_exc()
-    JANITOR_CPP = False
-# Generic element type used in the annotations of form_ngrams.
-T = TypeVar("T")
-# Implementation from nltk source
-# https://www.nltk.org/_modules/nltk/util.html
-def form_ngrams(sequence: Iterator[T], n: int) -> Iterator[Tuple[T, ...]]:
-    """Yield every run of ``n`` consecutive items of ``sequence`` as a tuple.
-    ``sequence`` must be an iterator (it is advanced with ``next``). Nothing
-    is yielded when it holds fewer than ``n`` items.
-    """
-    window = []
-    # Pre-fill the window with the first n - 1 items.
-    for _ in range(n - 1):
-        # PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
-        try:
-            window.append(next(sequence))
-        except StopIteration:
-            # no more data, terminate the generator
-            return
-    for item in sequence:
-        window.append(item)
-        yield tuple(window)
-        window.pop(0)
-def word_ngrams(s: str, n: int) -> Iterator[str]:
-    """Splits a string into ngram words"""
-    words = iter(s.split())  # not a generator :(
-    # Each n-gram tuple is joined back into one space-separated string, lazily.
-    return map(" ".join, form_ngrams(words, n))
-# Does character sequences only - combined faster function to play around with later
-# def word_ngrams_indices_combined(sequence, n):
-#     current_word = ""
-#     history = []
-#     gap = False;
-#     start = 0
-#     end = 0
-#     for character in sequence:
-#         if character == " ":
-#             if not gap:
-#                 gap = True
-#                 history.append(current_word)
-#                 end += len(current_word) - 1
-#                 current_word = ""
-#                 if len(history) == n:
-#                     yield (tuple(history), start, end)
-#                     del history[0]
-#                     start = end + 1
-#                     end = start
-#         else:
-#             gap = False
-#             current_word += character
-# https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
-def split_indices(s: str) -> Iterator[Tuple[str, Tuple[int, int]]]:
-    """Split a string on whitespace and record where each word sits in the original.
-    Returns a lazy iterator of ``(word, (start_idx, end_idx))`` pairs, where
-    ``end_idx`` is the index of the word's last character (inclusive).
-    """
-    def _locate(match):
-        begin, stop = match.span()
-        return match.group(0), (begin, stop - 1)
-    return map(_locate, re.finditer(r"\S+", s))
-def word_ngrams_indices(s: str, n: int) -> Iterator[Tuple[str, Tuple[int, int]]]:
-    """Splits a string into pairs of (ngram words, their start/end indices)"""
-    def _merge(ngram_with_indices):
-        # ngram_with_indices is ((word, (start, end)), ...): separate the words
-        # from their spans, then keep the first start and the last end.
-        words, spans = zip(*ngram_with_indices)
-        return " ".join(words), (spans[0][0], spans[-1][1])
-    # n-grams over the (word, (start, end)) pairs, collapsed one by one, lazily.
-    return map(_merge, form_ngrams(split_indices(s), n))
-class Janitor:
-    """Register contaminant n-grams and cut them, plus a margin, out of other text.
-    The C++ helpers are used when ``JANITOR_CPP`` is true, otherwise the
-    pure-Python implementations below.
-    """
-    # FIXME delete_chars: Should anything else go here? Special chars?
-    def __init__(
-        self,
-        ngram_n: int = 13,
-        window_to_remove: int = 200,
-        too_dirty_cutoff: int = 10,
-        minimum_slice_length: int = 200,
-        delete_chars: str = string.punctuation,
-    ) -> None:
-        """Store the settings and build the normalization table.
-        Args:
-            ngram_n: number of words per n-gram.
-            window_to_remove: characters removed on each side of a match.
-            too_dirty_cutoff: a string with this many matches yields no chunks.
-            minimum_slice_length: shorter clean slices are discarded.
-            delete_chars: characters deleted during normalization.
-        """
-        self.ngram_n = ngram_n
-        self.window_to_remove = window_to_remove
-        self.too_dirty_cutoff = too_dirty_cutoff
-        self.minimum_slice_length = minimum_slice_length
-        self.delete_chars = delete_chars
-        self.dirt_ngrams = set()
-        # If in python, we'll translate uppercase to lowercase and delete naughty characters.
-        # This is fast by python standards
-        # https://stackoverflow.com/questions/638893/what-is-the-most-efficient-way-in-python-to-convert-a-string-to-all-lowercase-st
-        self.translation_table = str.maketrans(
-            string.ascii_lowercase + string.ascii_uppercase,  # These characters
-            string.ascii_lowercase * 2,  # Become these characters
-            self.delete_chars,  # These are deleted
-        )
-    ##############
-    # I/O for saving contamination ngrams
-    ##############
-    def save_contamination_ngrams(self, filename: str) -> None:
-        """Pickle the registered n-gram set into ``filename``."""
-        with open(filename, "wb") as fp:
-            # Persist the n-grams themselves (the file name used to be written
-            # here, so a later load replaced the set with a string).
-            pickle.dump(self.dirt_ngrams, fp)
-    def load_contamination_ngrams(self, filename: str) -> None:
-        """Replace the registered n-gram set with the one pickled in ``filename``."""
-        # NOTE(review): pickle.load runs arbitrary code; only load files
-        # written by save_contamination_ngrams.
-        with open(filename, "rb") as fp:
-            self.dirt_ngrams = pickle.load(fp)
-    ##############
-    # Call these :)
-    ##############
-    def register_contaminant(self, dirt_string: str) -> None:
-        """Register a string as contamination to be removed, e.g. a test set
-        This breaks the dirt_string into ngrams to store for future cleaning"""
-        if JANITOR_CPP:
-            return self.register_contaminant_cpp(dirt_string)
-        else:
-            print("WARNING: Janitor running in python mode")
-            return self.register_contaminant_python(dirt_string)
-    def clean(self, dirty_string: str) -> List[str]:
-        """Clean a string (e.g. a training set) by removing all ngrams previously
-        registered as contaminants. Returns a list of clean chunks, or empty if
-        the string was too dirty"""
-        if JANITOR_CPP:
-            return self.clean_cpp(dirty_string)
-        else:
-            print("WARNING: Janitor running in python mode")
-            return self.clean_python(dirty_string)
-    def _split_chunks(
-        self, dirty_string: str, dirty_parts: Sequence[Tuple]
-    ) -> List[str]:
-        """Return the slices of ``dirty_string`` left after removing each dirty part.
-        ``dirty_parts`` yields ``(ngram, start, end)`` triples. Each span is
-        widened by ``window_to_remove`` on both sides, and only slices longer
-        than ``minimum_slice_length`` are kept. Returns ``[]`` as soon as
-        ``too_dirty_cutoff`` parts have been seen.
-        """
-        clean_chunks = []
-        splice_idx = 0
-        end = -1
-        for i, (ngram, start, end) in enumerate(dirty_parts):
-            if i >= self.too_dirty_cutoff:
-                return []
-            start = max(0, start - self.window_to_remove)
-            end = min(len(dirty_string), end + self.window_to_remove)
-            if start - splice_idx > self.minimum_slice_length:
-                clean_chunks.append(dirty_string[splice_idx:start])
-            splice_idx = end
-        # Tail after the last removed window (the whole string when nothing matched).
-        if end < len(dirty_string) - self.minimum_slice_length:
-            clean_chunks.append(dirty_string[end + 1 :])
-        return clean_chunks
-    ##############
-    # Fast C++
-    ##############
-    def register_contaminant_cpp(self, dirt_string) -> None:
-        """Add the n-grams of ``dirt_string``, computed by ``janitor_util``."""
-        self.dirt_ngrams.update(
-            janitor_util.clean_ngram(dirt_string, self.delete_chars, self.ngram_n)
-        )
-    def clean_cpp(self, dirty_string: str) -> List[str]:
-        """Split ``dirty_string`` using the spans returned by ``janitor_util.clean_ngram_with_indices``."""
-        contamination_indices = janitor_util.clean_ngram_with_indices(
-            dirty_string, self.delete_chars, self.ngram_n
-        )
-        return self._split_chunks(dirty_string, contamination_indices)
-    ##############
-    # Slow python
-    ##############
-    def normalize_string(self, s: str) -> str:
-        """Lowercase ASCII letters and delete every character in ``delete_chars``."""
-        return s.translate(self.translation_table)
-    def register_contaminant_python(self, dirt_string: str) -> None:
-        """Add the word n-grams of the normalized ``dirt_string``."""
-        self.dirt_ngrams.update(
-            word_ngrams(self.normalize_string(dirt_string), self.ngram_n)
-        )
-    def clean_python(self, dirty_string: str) -> List[str]:
-        """Split ``dirty_string`` around every n-gram whose normalized form is registered."""
-        # The n-gram slot is None: _split_chunks only uses the start/end indices.
-        contamination_indices = (
-            (None, *idx_pair)
-            for dirty_ngram, idx_pair in word_ngrams_indices(dirty_string, self.ngram_n)
-            if self.normalize_string(dirty_ngram) in self.dirt_ngrams
-        )
-        return self._split_chunks(dirty_string, contamination_indices)
-##################################################################
-# Tests
-#################################################################
-# def print_cpp():
-#     source = """   ,, I'm a very !dirty,, ,,  dirty boy. Clean me daddy. \n\nhe he he hehe heh.  lastword  """ * 2
-#     for i in range(1, 10, 2):
-#         pprint(janitor_util.clean_ngram(source, string.punctuation, i))
-#         for ngram, start, end in \
-#                 janitor_util.clean_ngram_with_indices(source, string.punctuation, i):
-#             print(ngram, "\t", start, end, source[start:end].replace("\n", "\\n"))
-# def test_cpp():
-#     source = """   ,, I'm a very !dirty,, ,,  dirty boy. Clean me daddy. \n\nhe he he hehe heh.  lastword  """ * 2
-#     contaminant = "dirty boy. Clean he he"
-#     jan_python = Janitor()
-#     jan_cpp = Janitor()
-#     jan_python.register_contaminant_python(contaminant)
-#     jan_cpp.register_contaminant(contaminant)
-#     assert jan_python.dirt_ngrams == jan_cpp.dirt_ngrams, (jan_python.dirt_ngrams, jan_cpp.dirt_ngrams)
-#     assert jan_python.clean_python(source) == jan_cpp.clean(source), \
-#         (jan_python.clean_python(source), jan_cpp.clean(source))
-#     print("Passed test, python==cpp")
-# def benchmark():
-#     # Download and put in data folder: enwik8 (100 MB) from https://cs.fit.edu/~mmahoney/compression/textdata.html
-#     setup = \
-#         """
-#         with open("data/enwik8", "r") as f:
-#             data = f.read()
-#         jan = Janitor(too_dirty_cutoff=1000)
-#         jan.register_contaminant('''
-#         theories is that there is a connection between &quot;geekdom&quot; and autism.
-#         This is hinted, for instance, by a ''Wired Magazine'' article in 2001 entitled &quot;
-#         The [[Geek]] Syndrome&quot;, which is a point argued by many in the autism rights
-#         movement{{ref|Wired}}.  This article, many professionals assert, is just one example of
-#         the media's application of mental disease labels to what is actually variant normal behavior
-#         &amp;mdash;they argue that shyness, lack of athletic ability or social skills, and intellectual
-#         interests, even when they seem unusual to others, are not in themselves signs of autism or
-#         Asperger's syndrome. Others assert that it is actually the medical profession which is applying
-#         mental disease labels to children who in the past would have simply been accepted as a little
-#         different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue.
-#         Due to the recent publicity surrounding autism and autis
-#         ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958.  At first,
-#         oil money had a marginal impact.  A few lowrise concete buildings were erected, and the first
-#         paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
-#         would last, took a cautious approach, preferring to save the revenue rather than investing it in
-#         development.  His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
-#         to transform Abu Dhabi.  The ruling Al Nahayan family decided that Sheikh Zayed should replace his
-#         brother as Ruler and carry out his vision of developing the country.  On [[August 6]], [[1966]],
-#         with the assistance of the British, Sheikh Zayed became the new ruler.  See generally, Al-Fahim, M,
-#         ''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995),
-#         ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the
-#         Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the
-#         [[United Arab Emirates]]. After the Emirates gained independence in 1971,
-#         ''')
-#         """
-#     n = 1
-#     print(f"Timing {n} run on 100 MB")
-#     print("Register contaminant")
-#     # print("\tPython", timeit.timeit("jan.register_contaminant_python(data)", setup=setup, globals=globals(), number=n))
-#     print("\tCpp", timeit.timeit("jan.register_contaminant(data)", setup=setup, globals=globals(), number=n))
-#     print("Clean")
-#     # print("\tPython", timeit.timeit("jan.clean_python(data)", setup=setup, globals=globals(), number=n))
-#     print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n))
-# def test_janitor_general():
-#     source = """   ,, I'm a very !dirty,, ,,  dirty boy. Clean me daddy. \n\nhe he he hehe heh.  lastword  """ * 2
-#     contaminant = "dirty boy. Clean he he"
-#     jan = Janitor(ngram_n=3)
-#     jan.register_contaminant(contaminant)
-#     cleaned = " ".join(jan.clean(source))
-#     for contam in jan.dirt_ngrams:
-#         assert contam not in cleaned, contam
-#     filename = "data/saved_contam"
-#     jan.save_contamination_ngrams(filename)
-#     jan = Janitor(ngram_n=3)
-#     jan.load_contamination_ngrams(filename)
-#     cleaned = " ".join(jan.clean(source))
-#     for contam in jan.dirt_ngrams:
-#         assert contam not in cleaned, contam
-# if __name__ == "__main__":
-#     test()
-#     # print_cpp()
-#     # test_cpp()
-#     # benchmark()

lm-evaluation-harness/lm_eval/evaluator.py DELETED Viewed

@@ -1,761 +0,0 @@
-import itertools
-import json
-import logging
-import random
-import time
-from collections import defaultdict
-from typing import TYPE_CHECKING, List, Optional, Union
-import numpy as np
-import torch
-import lm_eval.api.metrics
-import lm_eval.api.registry
-import lm_eval.api.task
-import lm_eval.models
-from lm_eval.caching.cache import delete_cache
-from lm_eval.evaluator_utils import (
-    consolidate_group_results,
-    consolidate_results,
-    get_sample_size,
-    get_subtask_list,
-    get_task_list,
-    prepare_print_tasks,
-    print_writeout,
-    run_task_tests,
-)
-from lm_eval.loggers import EvaluationTracker
-from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash
-from lm_eval.tasks import TaskManager, get_task_dict
-from lm_eval.utils import (
-    handle_non_serializable,
-    hash_string,
-    positional_deprecated,
-    setup_logging,
-    simple_parse_args_string,
-)
-if TYPE_CHECKING:
-    from lm_eval.api.model import LM
-    from lm_eval.api.task import Task
-eval_logger = logging.getLogger(__name__)
-@positional_deprecated
-def simple_evaluate(
-    model,
-    model_args: Optional[Union[str, dict]] = None,
-    tasks: Optional[List[Union[str, dict, object]]] = None,
-    num_fewshot: Optional[int] = None,
-    batch_size: Optional[Union[int, str]] = None,
-    max_batch_size: Optional[int] = None,
-    device: Optional[str] = None,
-    use_cache: Optional[str] = None,
-    cache_requests: bool = False,
-    rewrite_requests_cache: bool = False,
-    delete_requests_cache: bool = False,
-    limit: Optional[Union[int, float]] = None,
-    samples: Optional[dict] = None,
-    bootstrap_iters: int = 100000,
-    check_integrity: bool = False,
-    write_out: bool = False,
-    log_samples: bool = True,
-    evaluation_tracker: Optional[EvaluationTracker] = None,
-    system_instruction: Optional[str] = None,
-    apply_chat_template: Union[bool, str] = False,
-    fewshot_as_multiturn: bool = False,
-    gen_kwargs: Union[str, dict, None] = None,
-    task_manager: Optional[TaskManager] = None,
-    verbosity=None,
-    predict_only: bool = False,
-    random_seed: int = 0,
-    numpy_random_seed: int = 1234,
-    torch_random_seed: int = 1234,
-    fewshot_random_seed: int = 1234,
-    confirm_run_unsafe_code: bool = False,
-    metadata: Optional[dict] = None,
-):
-    """Instantiate (if needed) and evaluate a model on a list of tasks.
-    This is the high-level entry point: it seeds the RNGs, builds or validates the
-    model, resolves the tasks, applies per-task config overrides, delegates to
-    `evaluate`, and finally attaches run configuration to the results.
-    :param model: Union[str, LM]
-        Name of model or LM object, see lm_eval.models.get_model
-    :param model_args: Optional[str, dict]
-        String or dict arguments for each model class, see LM.create_from_arg_string and LM.create_from_arg_object.
-        Ignored if `model` argument is a LM object.
-    :param tasks: list[Union[str, dict, Task]]
-        List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
-    :param num_fewshot: int
-        Number of examples in few-shot context. Does not override tasks whose config sets num_fewshot to 0.
-    :param batch_size: int or str, optional
-        Batch size for model
-    :param max_batch_size: int, optional
-        Maximal batch size to try with automatic batch size detection
-    :param device: str, optional
-        PyTorch device (e.g. "cpu" or "cuda:0") for running models
-    :param use_cache: str, optional
-        A path prefix for a sqlite db file for caching model responses ("_rank<rank>.db" is appended). `None` if not caching.
-    :param cache_requests: bool, optional
-        Speed up evaluation by caching the building of dataset requests.
-    :param rewrite_requests_cache: bool, optional
-        Rewrites all the request cache if set to `True`.
-    :param delete_requests_cache: bool, optional
-        Deletes all the request cache if set to `True`.
-    :param limit: int or float, optional
-        Limit the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples.
-        Mutually exclusive with `samples`.
-    :param samples: dictionary, optional
-        Dictionary indicating which examples should be tested in each task, e.g., {"mmlu_astronomy":[0,3,6],"mmlu_anatomy":[1,4,7,10]}.
-        Mutually exclusive with `limit`.
-    :param bootstrap_iters:
-        Number of iterations for bootstrap statistics, used when calculating stderrs. set to 0 for no stderr calculations to be performed.
-    :param check_integrity: bool
-        Whether to run the relevant part of the test suite for the tasks
-    :param write_out: bool
-        If True, write out an example document and model input for checking task integrity
-    :param log_samples: bool
-        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis.
-        Forced to True when `predict_only` is set.
-    :param evaluation_tracker: EvaluationTracker, optional
-        If provided, the experiment arguments (model source, model args, system instruction, chat template,
-        fewshot_as_multiturn) are logged to its general_config_tracker before evaluation.
-    :param system_instruction: str
-        System instruction to be applied to the prompt
-    :param apply_chat_template: Union[bool, str]
-        Specifies whether to apply a chat template to the prompt.
-        - If set to True, the default chat template is applied.
-        - If set to a string, applies the specified chat template by name.
-        Defaults to False (no chat template applied).
-    :param fewshot_as_multiturn: bool
-        Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
-    :param gen_kwargs: dict or comma-separated string
-        Arguments for model generation
-        Ignored for all tasks with loglikelihood output_type
-    :param task_manager: TaskManager, optional
-        Task manager used to resolve `tasks`. If None, one is created whose metadata is the parsed
-        `model_args` merged with `metadata`.
-    :param verbosity: str
-        Verbosity level for logging
-    :param predict_only: bool
-        If true only model outputs will be generated and returned. Metrics will not be evaluated
-    :param random_seed: int
-        Random seed for python's random module. If set to None, the seed will not be set.
-    :param numpy_random_seed: int
-        Random seed for numpy. If set to None, the seed will not be set.
-    :param torch_random_seed: int
-        Random seed for torch. If set to None, the seed will not be set.
-    :param fewshot_random_seed: int
-        Random seed for fewshot sampler random generator. If set to None, the seed of generator will be set to None.
-    :param confirm_run_unsafe_code: bool
-        Forwarded unchanged to `evaluate`.
-    :param metadata: dict
-        Additional metadata to be added to the task manager. Will get passed to the download function of the task.
-    :raises ValueError:
-        If both `limit` and `samples` are given, or if `tasks` is empty.
-    :raises TypeError:
-        If `model` is neither a string nor an lm_eval.api.model.LM instance.
-    :return:
-        Dictionary of results on rank 0, `None` on every other rank.
-    """
-    if verbosity is not None:
-        setup_logging(verbosity=verbosity)
-    # wall-clock start, stored as results["date"] at the end of the run
-    start_date = time.time()
-    # `limit` and `samples` are two alternative ways of selecting docs; allow only one
-    if limit is not None and samples is not None:
-        raise ValueError(
-            "Either 'limit' or 'samples' must be None, but both are not None."
-        )
-    # heuristic: the substring "instruct" anywhere in a string model_args triggers the warning
-    if isinstance(model_args, str) and (
-        "instruct" in model_args and not apply_chat_template
-    ):
-        eval_logger.warning(
-            "Instruct model detected, but chat template not applied. Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`)."
-        )
-    if delete_requests_cache:
-        eval_logger.info("Deleting requests cache...")
-        delete_cache()
-    # seed each RNG that has a non-None seed, collecting one combined log line
-    seed_message = []
-    if random_seed is not None:
-        # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1412
-        seed_message.append(f"Setting random seed to {random_seed}")
-        random.seed(random_seed)
-    if numpy_random_seed is not None:
-        seed_message.append(f"Setting numpy seed to {numpy_random_seed}")
-        np.random.seed(numpy_random_seed)
-    if torch_random_seed is not None:
-        seed_message.append(f"Setting torch manual seed to {torch_random_seed}")
-        torch.manual_seed(torch_random_seed)
-    if fewshot_random_seed is not None:
-        # only logged here; the seed itself is applied per task in _adjust_config below
-        seed_message.append(f"Setting fewshot manual seed to {fewshot_random_seed}")
-    if seed_message:
-        eval_logger.info(" | ".join(seed_message))
-    if tasks is None:
-        tasks = []
-    if len(tasks) == 0:
-        raise ValueError(
-            "No tasks specified, or no tasks found. Please verify the task names."
-        )
-    if gen_kwargs is not None:
-        if isinstance(gen_kwargs, str):
-            gen_kwargs = simple_parse_args_string(gen_kwargs)
-        eval_logger.warning(
-            f"generation_kwargs: {gen_kwargs} specified through cli, these settings will update set parameters in yaml tasks. "
-            "Ensure 'do_sample=True' for non-greedy decoding!"
-        )
-        # normalize an empty mapping to None so the per-task override below is skipped
-        if not gen_kwargs:
-            gen_kwargs = None
-    # build the LM from the registry when given a name, otherwise validate the passed object
-    if isinstance(model, str):
-        if model_args is None:
-            eval_logger.warning("model_args not specified. Using defaults.")
-            model_args = ""
-        if isinstance(model_args, dict):
-            eval_logger.info(
-                f"Initializing {model} model, with arguments: {model_args}"
-            )
-            lm = lm_eval.api.registry.get_model(model).create_from_arg_obj(
-                model_args,
-                {
-                    "batch_size": batch_size,
-                    "max_batch_size": max_batch_size,
-                    "device": device,
-                },
-            )
-        else:
-            eval_logger.info(
-                f"Initializing {model} model, with arguments: {simple_parse_args_string(model_args)}"
-            )
-            lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
-                model_args,
-                {
-                    "batch_size": batch_size,
-                    "max_batch_size": max_batch_size,
-                    "device": device,
-                },
-            )
-    else:
-        if not isinstance(model, lm_eval.api.model.LM):
-            raise TypeError(
-                f"The value of `model` passed to simple_evaluate() was of type {type(model)}, but is required to be a subclass of lm_eval.api.model.LM . This may be because you are passing an initialized Hugging Face PreTrainedModel without having wrapped it in `lm_eval.models.huggingface.HFLM(pretrained=my_model)` first."
-            )
-        eval_logger.info("Using pre-initialized model")
-        lm = model
-    # optionally wrap the model so responses are cached in a per-rank sqlite db
-    if use_cache is not None:
-        eval_logger.info(f"Using cache at {use_cache + '_rank' + str(lm.rank) + '.db'}")
-        lm = lm_eval.api.model.CachingLM(
-            lm,
-            use_cache
-            # each rank receives a different cache db.
-            # necessary to avoid multiple writes to cache at once
-            + "_rank"
-            + str(lm.rank)
-            + ".db",
-        )
-    if task_manager is None:
-        # metadata = model_args (parsed to a dict) overlaid with the explicit `metadata`;
-        # with dict union the right-hand operand wins on duplicate keys
-        metadata = (
-            simple_parse_args_string(model_args)
-            if isinstance(model_args, str)
-            else model_args
-            if isinstance(model_args, dict)
-            else {}
-        ) | (metadata or {})
-        task_manager = TaskManager(metadata=metadata)
-    task_dict = get_task_dict(
-        tasks,
-        task_manager,
-    )
-    # helper function to recursively apply config overrides to leaf subtasks, skipping their constituent groups.
-    # (setting of num_fewshot ; bypassing metric calculation ; setting fewshot seed)
-    def _adjust_config(task_dict):
-        adjusted_task_dict = {}
-        for task_name, task_obj in task_dict.items():
-            if isinstance(task_obj, dict):
-                # nested dict == group: recurse and keep the same key
-                adjusted_task_dict = {
-                    **adjusted_task_dict,
-                    **{task_name: _adjust_config(task_obj)},
-                }
-            else:
-                if task_obj.get_config("output_type") == "generate_until":
-                    if gen_kwargs is not None:
-                        task_obj.set_config(
-                            key="generation_kwargs", value=gen_kwargs, update=True
-                        )
-                    eval_logger.info(
-                        f"{task_obj.config.task}: Using gen_kwargs: {task_obj.config.generation_kwargs}"
-                    )
-                if predict_only:
-                    eval_logger.info(
-                        f"Processing {task_name} in output-only mode. Metrics will not be calculated!"
-                    )
-                    # we have to change the class properties post-hoc. This is pretty hacky.
-                    task_obj.override_metric(metric_name="bypass")
-                # override tasks' fewshot values to the provided num_fewshot arg value
-                # except if tasks have it set to 0 manually in their configs--then we should never overwrite that
-                if num_fewshot is not None:
-                    if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
-                        eval_logger.info(
-                            f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
-                        )
-                    else:
-                        eval_logger.warning(
-                            f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
-                        )
-                        task_obj.set_config(key="num_fewshot", value=num_fewshot)
-                else:
-                    # if num_fewshot not provided, and the task does not define a default one, default to 0
-                    if (
-                        default_num_fewshot := task_obj.get_config("num_fewshot")
-                    ) is None:
-                        task_obj.set_config(key="num_fewshot", value=0)
-                # fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. in the YAML file)
-                task_obj.set_fewshot_seed(seed=fewshot_random_seed)
-                adjusted_task_dict[task_name] = task_obj
-        return adjusted_task_dict
-    task_dict = _adjust_config(task_dict)
-    if check_integrity:
-        run_task_tests(task_list=tasks)
-    if evaluation_tracker is not None:
-        evaluation_tracker.general_config_tracker.log_experiment_args(
-            model_source=model,
-            model_args=model_args,
-            system_instruction=system_instruction,
-            chat_template=lm.chat_template(apply_chat_template)
-            if apply_chat_template
-            else None,
-            fewshot_as_multiturn=fewshot_as_multiturn,
-        )
-    # predict_only forces log_samples=True for the evaluate() call
-    results = evaluate(
-        lm=lm,
-        task_dict=task_dict,
-        limit=limit,
-        samples=samples,
-        cache_requests=cache_requests,
-        rewrite_requests_cache=rewrite_requests_cache,
-        bootstrap_iters=bootstrap_iters,
-        write_out=write_out,
-        log_samples=True if predict_only else log_samples,
-        system_instruction=system_instruction,
-        apply_chat_template=apply_chat_template,
-        fewshot_as_multiturn=fewshot_as_multiturn,
-        verbosity=verbosity,
-        confirm_run_unsafe_code=confirm_run_unsafe_code,
-    )
-    # NOTE(review): logging is configured a second time here; presumably to restore the
-    # requested verbosity in case it was changed during evaluate() -- confirm
-    if verbosity is not None:
-        setup_logging(verbosity=verbosity)
-    # only rank 0 decorates and returns results; other ranks return None
-    if lm.rank == 0:
-        if isinstance(model, str):
-            model_name = model
-        elif hasattr(model, "config") and hasattr(model.config, "_name_or_path"):
-            model_name = model.config._name_or_path
-        else:
-            model_name = type(model).__name__
-        # add info about the model and few shot config
-        results["config"] = {
-            "model": model_name,
-            "model_args": model_args,
-        }
-        # add more detailed model info if available
-        if isinstance(lm, lm_eval.models.huggingface.HFLM):
-            results["config"].update(lm.get_model_info())
-        # add info about execution
-        results["config"].update(
-            {
-                "batch_size": batch_size,
-                "batch_sizes": (
-                    list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else []
-                ),
-                "device": device,
-                "use_cache": use_cache,
-                "limit": limit,
-                "bootstrap_iters": bootstrap_iters,
-                "gen_kwargs": gen_kwargs,
-                "random_seed": random_seed,
-                "numpy_seed": numpy_random_seed,
-                "torch_seed": torch_random_seed,
-                "fewshot_seed": fewshot_random_seed,
-            }
-        )
-        results["git_hash"] = get_git_commit_hash()
-        results["date"] = start_date
-        add_env_info(results)  # additional environment info to results
-        add_tokenizer_info(results, lm)  # additional info about tokenizer
-        return results
-    else:
-        return None
-@positional_deprecated
-def evaluate(
-    lm: "LM",
-    task_dict,
-    limit: Optional[int] = None,
-    samples: Optional[dict] = None,
-    cache_requests: bool = False,
-    rewrite_requests_cache: bool = False,
-    bootstrap_iters: Optional[int] = 100000,
-    write_out: bool = False,
-    log_samples: bool = True,
-    system_instruction: Optional[str] = None,
-    apply_chat_template: Union[bool, str] = False,
-    fewshot_as_multiturn: bool = False,
-    verbosity: str = "INFO",
-    confirm_run_unsafe_code: bool = False,
-):
-    """Evaluate an already-instantiated model on a dictionary of tasks.
-    The function builds all requests for every task, runs them through the model
-    grouped by request type, post-processes the responses into per-sample metrics,
-    gathers them on rank 0 when running distributed, and aggregates the results.
-    :param lm: obj
-        Language Model
-    :param task_dict: dict[str, Task]
-        Dictionary of tasks. Tasks will be taken to have name type(task).config.task .
-    :param limit: int, optional
-        Limit the number of examples per task (only use this for testing).
-        Mutually exclusive with `samples`.
-    :param samples: dictionary, optional
-        Dictionary indicating which examples should be tested in each task, e.g., {"mmlu_astronomy":[0,3,6],"mmlu_anatomy":[1,4,7,10]}.
-        Mutually exclusive with `limit`.
-    :param cache_requests: bool, optional
-        Speed up evaluation by caching the building of dataset requests.
-    :param rewrite_requests_cache: bool, optional
-        Rewrites all the request cache if set to `True`.
-    :param bootstrap_iters:
-        Number of iterations for bootstrap statistics, used when calculating stderr. Set to 0 for skipping all stderr calculations.
-    :param write_out: bool
-        If True, write out an example document and model input for checking task integrity
-    :param log_samples: bool
-        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
-    :param system_instruction: str
-        System instruction to be applied to the prompt
-    :param apply_chat_template: Union[bool, str]
-        Specifies whether to apply a chat template to the prompt.
-        - If set to True, the default chat template is applied.
-        - If set to a string, applies the specified chat template by name.
-        Defaults to False (no chat template applied).
-    :param fewshot_as_multiturn: bool
-        Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
-    :param verbosity: str
-        Verbosity level for logging (accepted for interface compatibility; not read in this function)
-    :param confirm_run_unsafe_code: bool
-        Whether to confirm running tasks marked as unsafe.
-    :raises ValueError:
-        If both `limit` and `samples` are given, if `log_samples` is False while a task uses the
-        'bypass' metric, if a multimodal task is paired with a non-multimodal model, or if an
-        unsafe task is requested without `confirm_run_unsafe_code`.
-    :return:
-        Dictionary of results on rank 0, `None` on every other rank.
-    """
-    if limit is not None and samples is not None:
-        raise ValueError(
-            "Either 'limit' or 'samples' must be None, but both are not None."
-        )
-    if samples is not None:
-        eval_logger.info(f"Evaluating examples for tasks {list(samples.keys())}")
-    if apply_chat_template:
-        eval_logger.warning(
-            "Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
-        )
-    # tracks all Instances/requests a model must generate output on.
-    requests = defaultdict(list)
-    # stores the amount to pad out reqs per req. type so that
-    # number of fwd passes per distributed rank is equal
-    padding_requests = defaultdict(int)
-    # get lists of group hierarchy and each type of request
-    eval_tasks = get_task_list(task_dict)
-    # 'bypass' tasks produce no metrics, so their only output is the logged samples
-    if not log_samples and any(
-        "bypass" in getattr(task_output.task, "_metric_fn_list", {})
-        for task_output in eval_tasks
-    ):
-        raise ValueError("log_samples must be True for 'bypass' metric-only tasks")
-    # validation checks:
-    # 1.are we running multimodal task <-> non-multimodal model class, or vice-versa.
-    # 2.are we running code that is marked as unsafe.
-    incompatible_tasks = []
-    for task_output in eval_tasks:
-        task: "Task" = task_output.task
-        if getattr(task, "MULTIMODAL", False) and not getattr(lm, "MULTIMODAL", False):
-            incompatible_tasks.append(task_output.task_name)
-        elif getattr(task, "UNSAFE_CODE", False) and not confirm_run_unsafe_code:
-            raise ValueError(
-                f"Attempted to run task: {task_output.task_name} which is marked as unsafe. Set confirm_run_unsafe_code=True to run this task."
-            )
-    # tasks are only recorded above when the model is not multimodal, so a
-    # non-empty list is always an error
-    if incompatible_tasks:
-        raise ValueError(
-            f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type."
-        )
-    # end validation check
-    # per-task effective limits, index-aligned with eval_tasks
-    limits = []
-    for task_output in eval_tasks:
-        task: "Task" = task_output.task
-        task_limit = get_sample_size(task, limit)
-        limits.append(task_limit)
-        task_samples = (
-            samples.get(task_output.task_name, None) if samples is not None else None
-        )
-        task.build_all_requests(
-            limit=task_limit,
-            samples=task_samples,
-            rank=lm.rank,
-            world_size=lm.world_size,
-            cache_requests=cache_requests,
-            rewrite_requests_cache=rewrite_requests_cache,
-            system_instruction=system_instruction,
-            apply_chat_template=bool(apply_chat_template),
-            fewshot_as_multiturn=fewshot_as_multiturn,
-            chat_template=getattr(lm, "apply_chat_template")
-            if apply_chat_template
-            else None,
-            tokenizer_name=getattr(lm, "tokenizer_name", "")
-            if apply_chat_template
-            else "",
-        )
-        eval_logger.debug(
-            f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
-        )
-        if write_out:
-            print_writeout(task)
-        # aggregate Instances by LM method requested to get output.
-        for instance in task.instances:
-            requests[instance.request_type].append(instance)
-        if lm.world_size > 1:
-            instances_rnk = torch.tensor(len(task._instances), device=lm.device)
-            gathered_item = (
-                lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist()
-            )
-            # "multiple_choice" task types dispatch (several) "loglikelihood" request types
-            reqtype = (
-                "loglikelihood"
-                if task.OUTPUT_TYPE == "multiple_choice"
-                else task.OUTPUT_TYPE
-            )
-            # compute number of pseudo-batches to pad with (FSDP/DDP require even batches among ranks)
-            numpad = max(gathered_item) - gathered_item[lm.rank]
-            # todo: may not account for padding in cases like SquadV2 which has multiple req types
-            padding_requests[reqtype] += numpad
-    ### Run LM on inputs, get all outputs ###
-    # execute each type of request
-    for reqtype, reqs in requests.items():
-        eval_logger.info(f"Running {reqtype} requests")
-        # create `K` copies of each request `req` based off `K = req.repeats`
-        cloned_reqs = []
-        for req in reqs:
-            cloned_reqs.extend([req] * req.repeats)
-        if (lm.world_size > 1) and (padding_requests[reqtype] > 0):
-            # pad with copies of the last request of this type; `reqs` is never empty
-            # because a key only exists in `requests` once an instance was appended
-            pad_req = reqs[-1]
-            for _ in range(padding_requests[reqtype]):
-                cloned_reqs.extend([pad_req] * pad_req.repeats)
-        # run requests through model
-        resps = getattr(lm, reqtype)(cloned_reqs)
-        # put responses from model into a list of length K for each request.
-        for x, req in zip(resps, cloned_reqs):
-            req.resps.append(x)
-        if lm.world_size > 1:
-            lm.accelerator.wait_for_everyone()
-    RANK = lm.rank
-    WORLD_SIZE = lm.world_size
-    ### Postprocess outputs ###
-    # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
-    for task_output, task_limit in zip(eval_tasks, limits):
-        task = task_output.task
-        task.apply_filters()
-        ### Collect values of metrics on all datapoints ###
-        # # unpack results and sort back in order and return control to Task
-        # TODO: make it possible to use a different metric per filter
-        # Pre-process task.instances to group by doc_id
-        instances_by_doc_id = defaultdict(list)
-        for instance in task.instances:
-            instances_by_doc_id[instance.doc_id].append(instance)
-        # Sort instances within each group
-        for instances in instances_by_doc_id.values():
-            instances.sort(key=lambda x: x.idx)
-        # explicit sample indices for this task (if any); invariant across filters
-        indices = (
-            samples.get(task_output.task_name, None) if samples is not None else None
-        )
-        # iterate over different filters used
-        for filter_key in task.instances[0].filtered_resps.keys():
-            doc_iterator = task.doc_iterator(
-                rank=RANK,
-                limit=task_limit,
-                world_size=WORLD_SIZE,
-                samples=indices,
-            )
-            for doc_id, doc in doc_iterator:
-                # with explicit samples, doc_id is a position into `indices`
-                doc_id_true = indices[doc_id] if indices else doc_id
-                # distinct name: must not rebind the `requests` mapping built above
-                doc_requests = instances_by_doc_id[doc_id]
-                filtered_resps = [
-                    req.filtered_resps[filter_key] for req in doc_requests
-                ]
-                metrics = task.process_results(doc, filtered_resps)
-                if log_samples:
-                    target = task.doc_to_target(doc)
-                    example = {
-                        "doc_id": doc_id_true,
-                        "doc": doc,
-                        "target": target,
-                        "arguments": [req.args for req in doc_requests],
-                        "resps": [req.resps for req in doc_requests],
-                        "filtered_resps": [
-                            req.filtered_resps[filter_key] for req in doc_requests
-                        ],
-                        "filter": filter_key,
-                        "metrics": list(metrics.keys()),
-                        "doc_hash": hash_string(
-                            json.dumps(
-                                doc_requests[0].doc,
-                                indent=2,
-                                default=handle_non_serializable,
-                                ensure_ascii=False,
-                            )
-                        ),
-                        "prompt_hash": hash_string(doc_requests[0].arguments[0]),
-                        "target_hash": hash_string(str(target)),
-                    }
-                    example.update(metrics)
-                    task_output.logged_samples.append(example)
-                for metric, value in metrics.items():
-                    task_output.sample_metrics[(metric, filter_key)].append(value)
-    if WORLD_SIZE > 1:
-        # if multigpu, then gather data across all ranks to rank 0
-        # first gather logged samples across all ranks
-        for task_output in eval_tasks:
-            if log_samples:
-                # for task_name, task_samples in list(samples.items()):
-                full_samples = [None] * WORLD_SIZE if RANK == 0 else None
-                torch.distributed.gather_object(
-                    obj=task_output.logged_samples,
-                    object_gather_list=full_samples,
-                    dst=0,
-                )
-                if RANK == 0:
-                    task_output.logged_samples = list(
-                        itertools.chain.from_iterable(full_samples)
-                    )
-            # then collect metrics across all ranks
-            for metrics in task_output.sample_metrics:
-                metric_list = [None] * WORLD_SIZE if RANK == 0 else None
-                torch.distributed.gather_object(
-                    obj=task_output.sample_metrics[metrics],
-                    object_gather_list=metric_list,
-                    dst=0,
-                )
-                if RANK == 0:
-                    task_output.sample_metrics[metrics] = list(
-                        itertools.chain.from_iterable(metric_list)
-                    )
-    if RANK != 0:
-        return None
-    ### Aggregate results over all datapoints ###
-    # aggregate results ; run bootstrap CIs
-    for task_output in eval_tasks:
-        task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters)
-    (
-        results,
-        consolidated_samples,
-        configs,
-        versions,
-        num_fewshot,
-        higher_is_better,
-    ) = consolidate_results(eval_tasks)
-    ### Calculate group metrics ###
-    # default so the name is bound even when there are no results to consolidate
-    show_group_table = False
-    if bool(results):
-        results, versions, show_group_table, *_ = consolidate_group_results(
-            results, versions, task_dict
-        )
-    results_agg, group_agg = prepare_print_tasks(task_dict, results)
-    subtask_list = get_subtask_list(task_dict)
-    # collect all higher_is_better values for metrics
-    # in the group's subtasks.
-    # TODO: clean this up ; unify with the below metric_list loop?
-    for group, task_list in subtask_list.items():
-        # subtask list will list "task_name": [] for solo tasks
-        if len(task_list) == 0:
-            continue
-        # fresh dict per group so groups neither alias one object nor inherit
-        # metrics from previously processed groups
-        group_higher_is_better = {}
-        for task in task_list:
-            for m, h in higher_is_better[task].items():
-                if m not in group_higher_is_better:
-                    group_higher_is_better[m] = h
-                elif (
-                    group_higher_is_better[m] is not None
-                    and group_higher_is_better[m] != h
-                ):
-                    eval_logger.warning(
-                        f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None."
-                    )
-                    group_higher_is_better[m] = None
-        higher_is_better[group] = group_higher_is_better
-    results_dict = {
-        "results": dict(results_agg.items()),
-        **(
-            {"groups": dict(group_agg.items())}
-            if (bool(group_agg) and show_group_table)
-            else {}
-        ),
-        "group_subtasks": dict(reversed(subtask_list.items())),
-        "configs": dict(sorted(configs.items())),
-        "versions": dict(sorted(versions.items())),
-        "n-shot": dict(sorted(num_fewshot.items())),
-        "higher_is_better": dict(sorted(higher_is_better.items())),
-        "n-samples": {
-            task_output.task_name: {
-                "original": len(task_output.task.eval_docs),
-                "effective": min(
-                    task_limit if task_limit else len(task_output.task.eval_docs),
-                    len(task_output.task.eval_docs),
-                ),
-            }
-            for task_output, task_limit in zip(eval_tasks, limits)
-        },
-    }
-    if log_samples:
-        results_dict["samples"] = dict(consolidated_samples)
-    return results_dict
-def request_caching_arg_to_dict(cache_requests: str) -> dict:
-    """Expand the request-caching mode string into the three boolean cache flags.
-    :param cache_requests: str
-        Caching mode. "true" enables the request cache, "refresh" enables it and
-        rewrites it, "delete" deletes it; any other value leaves all flags False.
-    :return:
-        Dict with the keys "cache_requests", "rewrite_requests_cache" and
-        "delete_requests_cache".
-    """
-    caching_enabled = cache_requests in {"true", "refresh"}
-    wants_rewrite = cache_requests == "refresh"
-    wants_delete = cache_requests == "delete"
-    return dict(
-        cache_requests=caching_enabled,
-        rewrite_requests_cache=wants_rewrite,
-        delete_requests_cache=wants_delete,
-    )

lm-evaluation-harness/lm_eval/evaluator_utils.py DELETED Viewed

@@ -1,554 +0,0 @@
-import collections
-import logging
-import math
-import pathlib
-import sys
-from typing import List, Optional, Tuple, Union
-from lm_eval.api.group import ConfigurableGroup
-from lm_eval.api.metrics import (
-    aggregate_subtask_metrics,
-    mean,
-    pooled_sample_stderr,
-    stderr_for_metric,
-)
-from lm_eval.api.task import Task
-from lm_eval.utils import positional_deprecated
-eval_logger = logging.getLogger(__name__)
-class TaskOutput:
-    """
-    Record kept for one task (or group placeholder) during an evaluation run.
-    It bundles the task's identity and configuration with the per-sample metrics gathered
-    for it and the aggregate metrics derived from them.
-        Attributes:
-            task (object): The task object; falsy for group placeholders.
-            task_name (str): The name of the task.
-            task_config (dict): The configuration dumped from the task.
-            version (str): The version of the task.
-            group_name (str): The name of the group the task was listed under, if any.
-            n_shot (int): The number of few-shot examples.
-            task_alias (str): The alias of the task.
-            group_alias (str): The alias of the task group.
-            is_group (bool): True for group placeholders created by `from_taskdict`.
-            logged_samples (list): The logged samples.
-            sample_len (int): Number of items seen for the most recently aggregated metric.
-            sample_metrics (defaultdict): Per-sample values keyed by (metric, filter_key).
-            agg_metrics (defaultdict): Aggregates keyed by "metric,filter_key" strings.
-        Methods:
-            from_taskdict(cls, task_name: str, task):
-                Builds a TaskOutput from one entry of a task dictionary.
-            calculate_aggregate_metric(bootstrap_iters=100000) -> None:
-                Fills `agg_metrics` from `sample_metrics`.
-    """
-    def __init__(
-        self,
-        task=None,
-        task_name=None,
-        task_config=None,
-        version=None,
-        group_name=None,
-        n_shot=None,
-        task_alias=None,
-        group_alias=None,
-        is_group=None,
-    ):
-        # identity and configuration, stored exactly as given
-        self.task, self.task_name, self.task_config = task, task_name, task_config
-        self.group_name, self.version, self.n_shot = group_name, version, n_shot
-        self.task_alias, self.group_alias = task_alias, group_alias
-        self.is_group = is_group
-        # state filled in while the evaluation runs
-        self.logged_samples = []
-        self.sample_len = None
-        self.sample_metrics = collections.defaultdict(list)
-        self.agg_metrics = collections.defaultdict(list)
-    @classmethod
-    def from_taskdict(cls, task_name: str, task):
-        """
-        Build a TaskOutput from a task-dict value.
-        `task` may be a `(group_name, task)` tuple or a bare task. A falsy task produces a
-        group placeholder (`is_group=True`) that carries no config or version.
-        """
-        group_name = None
-        if isinstance(task, tuple):
-            group_name, task = task
-        if not task:
-            # these gets filtered out in get_task_list
-            # once they are added to group hierarchy
-            return cls(
-                task=task, task_name=task_name, is_group=True, group_name=group_name
-            )
-        version = task.VERSION
-        config = dict(task.dump_config())
-        shots = config.get("num_fewshot")
-        if shots == 0:
-            # an explicit 0 defers to the value recorded in the config's metadata, if any
-            shots = config.get("metadata", {}).get("num_fewshot", 0)
-        return cls(
-            task=task,
-            task_name=task_name,
-            task_config=config,
-            group_name=group_name,
-            version=version,
-            n_shot=shots,
-            task_alias=config.get("alias"),
-            group_alias=config.get("group_alias"),
-        )
-    def calculate_aggregate_metric(self, bootstrap_iters=100000) -> None:
-        """
-        Aggregate every (metric, filter) series in `sample_metrics` into `agg_metrics`.
-        For each series the aggregate is stored under "metric,filter_key" and its standard
-        error (or "N/A") under "metric_stderr,filter_key".
-        Raises ValueError if `bootstrap_iters` is not an int.
-        """
-        for (metric, filter_key), items in self.sample_metrics.items():
-            try:
-                agg_fn = self.task.aggregation()[metric]
-            except KeyError:
-                # This is when process results output an arbitrary metric
-                # TODO: Handle this better and allow other aggregate functions other than mean.
-                agg_fn = mean
-            self.agg_metrics[f"{metric},{filter_key}"] = agg_fn(items)
-            self.sample_len = len(items)  # TODO: same sample size for each metric?
-            if not isinstance(bootstrap_iters, int):
-                raise ValueError(
-                    f"Received bootstrap_iters '{bootstrap_iters}' but expected an integer. Set to 0 to turn off stderr calculations."
-                )
-            # bleu/chrf/ter are capped at 100 bootstrap iterations
-            if metric in ["bleu", "chrf", "ter"]:
-                iters = min(bootstrap_iters, 100)
-            else:
-                iters = bootstrap_iters
-            stderr_fn = stderr_for_metric(metric=agg_fn, bootstrap_iters=iters)
-            if stderr_fn and len(items) > 1:
-                stderr_value = stderr_fn(items)
-            else:
-                stderr_value = "N/A"
-            self.agg_metrics[f"{metric}_stderr,{filter_key}"] = stderr_value
-    def __repr__(self):
-        fields = (
-            f"task_name={self.task_name}",
-            f"group_name={self.group_name}",
-            f"version={self.version}",
-            f"n_shot={self.n_shot}",
-            f"task_alias={self.task_alias}",
-            f"group_alias={self.group_alias}",
-        )
-        return "TaskOutput(" + ", ".join(fields) + ")"
-def get_task_list(task_dict: dict) -> List[TaskOutput]:
-    """
-    Flatten a (possibly nested) task dictionary into a list of TaskOutput objects.
-    Nested dict values are descended into recursively; every other value is converted with
-    `TaskOutput.from_taskdict`, using its key as the task name. Order follows dict iteration.
-    """
-    collected = []
-    for name, entry in task_dict.items():
-        if isinstance(entry, dict):
-            collected += get_task_list(entry)
-            continue
-        collected.append(TaskOutput.from_taskdict(name, entry))
-    return collected
-def get_subtask_list(task_dict, task_root=None, depth=0):
-    """
-    Build a mapping from each group (or standalone task) name to the names of its direct children.
-    While recursing, entries are keyed by `(name, depth)` tuples; the top-level call
-    (depth == 0) strips the depth and returns plain names as keys.
-    @param task_dict: Possibly nested dict whose keys are ConfigurableGroup objects or names and
-    whose values are nested dicts, ConfigurableGroup objects or Task objects.
-    @param task_root: Name of the group currently being descended into (None at the top level).
-    @param depth: Current recursion depth; 0 for the top-level call.
-    @return: dict of name -> list of child names (tuple keys for recursive calls).
-    """
-    subtask_list = {}
-    for group_obj, task_obj in task_dict.items():
-        if isinstance(group_obj, ConfigurableGroup):
-            # keys may be group objects; work with the plain group name
-            group_name = group_obj.group_name
-        else:
-            group_name = group_obj
-        if isinstance(task_obj, dict):
-            # nested group: collect its subtree one level deeper, rooted at this group
-            _subtask_list = get_subtask_list(
-                task_obj, task_root=group_name, depth=depth + 1
-            )
-            if task_root:
-                # register only the entries exactly one level below as children of the current root
-                subtask_list.setdefault((task_root, depth), []).extend(
-                    [
-                        _task
-                        for (_task, _depth) in _subtask_list.keys()
-                        if (_depth - 1) == depth
-                    ]
-                )
-            subtask_list = {**subtask_list, **_subtask_list}
-        else:
-            # NOTE(review): if task_obj is neither a ConfigurableGroup nor a Task, the name below
-            # is left unset (or stale from a previous iteration) - confirm callers never pass that.
-            if isinstance(task_obj, ConfigurableGroup):
-                # leaf that is itself a group object
-                group_or_task_name = task_obj.group_name
-            elif isinstance(task_obj, Task):
-                # ordinary task leaf
-                group_or_task_name = task_obj.task_name
-            if task_root is None:
-                # standalone entry: listed with an empty child list
-                subtask_list.setdefault((group_or_task_name, depth), [])
-            else:
-                subtask_list.setdefault((task_root, depth), []).append(
-                    group_or_task_name
-                )
-    if depth == 0:
-        # top-level call: drop the depth component of every key
-        _subtask_list = {}
-        for group_key, task_list in subtask_list.items():
-            # note: this rebinds the local `depth`, which is not read again before returning
-            group_name, depth = group_key
-            _subtask_list[group_name] = task_list
-        subtask_list = _subtask_list
-    return subtask_list
-def print_writeout(task) -> None:
-    """
-    Log the prompt, the target and the request of every instance of `task` whose `doc_id` is below 1.
-    Two INFO records are written per matching instance: one with the context prompt and the
-    target, one with the string form of the request.
-    """
-    for inst in task.instances:
-        # only instances with doc_id < 1 are written out
-        if not inst.doc_id < 1:
-            continue
-        eval_logger.info(
-            f"Task: {task}; document {inst.doc_id}; context prompt (starting on next line):\
-    \n{inst.args[0]}\n(end of prompt on previous line)\ntarget string or answer choice index (starting on next line):\n{task.doc_to_target(inst.doc)}\n(end of target on previous line)"
-        )
-        eval_logger.info(f"Request: {str(inst)}")
-def get_sample_size(task, limit: Optional[int]) -> Union[int, None]:
-    """
-    Turn a user-supplied `limit` into an absolute number of documents.
-    None is passed through. A value below 1.0 is treated as a fraction of
-    `len(task.eval_docs)` and rounded up; any other value is truncated to int.
-    """
-    if limit is None:
-        return None
-    if limit < 1.0:
-        return int(math.ceil(len(task.eval_docs) * limit))
-    return int(limit)
-def prepare_print_tasks(
-    task_dict: dict,
-    results: dict,
-    task_depth=0,
-    group_depth=0,
-) -> Tuple[dict, dict]:
-    """
-    Prepares the task hierarchy and aggregates the results for each task and group recursively for printing.
-    @param task_dict: Dictionary representing the group hierarchy of tasks. Keys are ConfigurableGroup
-    objects or name strings; values are nested dictionaries of the same form, or Task objects.
-    @param results: Dictionary containing the results of each task and group, keyed by name.
-    Each value is a dictionary of results that is copied, never modified in place.
-    @param task_depth: The indentation level for printing the task
-    hierarchy. Default is 0.
-    @param group_depth: The indentation level for printing the group
-    hierarchy. Default is 0.
-    @return: A tuple of two dictionaries: task_agg and group_agg. task_agg contains the
-    results for every task and group with an indented "alias", and group_agg contains the
-    same for each group that reports aggregated scores. The "samples" entry is removed from both.
-    """
-    def _sort_task_dict(task_dict):
-        """
-        Helper utility. Sorts the task dict at the current level of the hierarchy based on alphabetized task name.
-        Required so that we end up sorting within each sub-header correctly.
-        """
-        return dict(
-            sorted(
-                task_dict.items(),
-                key=lambda item: item[0].group_name
-                if isinstance(item[0], ConfigurableGroup)
-                else item[0],
-            )
-        )
-    task_agg = collections.defaultdict(dict)
-    group_agg = collections.defaultdict(dict)
-    task_dict = _sort_task_dict(task_dict)
-    for task_or_group_name, task_or_group_obj in task_dict.items():
-        # prefix used to indent aliases: one space per level followed by "- ", empty at the top level
-        tab_string = " " * task_depth + "- " if task_depth > 0 else ""
-        # NOTE(review): a key that is neither a ConfigurableGroup nor a str leaves `name` and
-        # `from_configurable_group` unset (or stale from a previous iteration) - confirm this cannot happen.
-        if isinstance(task_or_group_name, ConfigurableGroup):
-            # group key: use its name and sort its children for the recursive call below
-            name = task_or_group_name.group_name
-            from_configurable_group = True
-            task_or_group_obj = _sort_task_dict(task_or_group_obj)
-        elif isinstance(task_or_group_name, str):
-            name = task_or_group_name
-            if isinstance(task_or_group_obj, Task):
-                # prefer the name the Task object reports over the dict key
-                name = task_or_group_obj.task_name
-            from_configurable_group = False
-        task_agg[name] = results[name].copy()
-        if from_configurable_group:
-            if task_or_group_name.group_alias is not None:
-                alias = task_or_group_name.group_alias
-            else:
-                alias = task_or_group_name.group
-        else:
-            if "alias" in task_agg[name]:
-                alias = task_agg[name]["alias"]
-            else:
-                alias = name
-        task_agg[name]["alias"] = tab_string + alias
-        if "samples" in task_agg[name]:
-            task_agg[name].pop("samples")
-        # The " " key is the marker consolidate_group_results writes for groups that do not
-        # aggregate; only groups without it get a row in the group table.
-        if from_configurable_group and (" " not in results[name]):
-            group_tab_string = " " * group_depth + "- " if group_depth > 0 else ""
-            group_agg[name] = results[name].copy()
-            group_agg[name]["alias"] = group_tab_string + alias
-            if "samples" in group_agg[name]:
-                group_agg[name].pop("samples")
-        if isinstance(task_or_group_obj, dict):
-            # descend one level; the depth counters are restored after the recursive call
-            task_depth += 1
-            group_depth += 1
-            _task_agg, _group_agg = prepare_print_tasks(
-                task_or_group_obj, results, task_depth, group_depth
-            )
-            # merging replaces the defaultdicts with plain dicts; entries from the subtree win on key clashes
-            task_agg = {
-                **task_agg,
-                **_task_agg,
-            }
-            group_agg = {**group_agg, **_group_agg}
-            task_depth -= 1
-            group_depth -= 1
-    return task_agg, group_agg
-def consolidate_results(
-    eval_tasks: List[TaskOutput],
-) -> Tuple[dict, dict, dict, dict, dict, dict]:
-    """
-    Consolidates the results of multiple evaluation tasks into a single structure.
-    @param eval_tasks: list(TaskOutput).
-    @return: A tuple containing, in this order, the consolidated results, samples, configs, versions,
-    num_fewshot, and higher_is_better.
-    The method iterates over each TaskOutput and extracts relevant information to create the consolidated
-    results structure. The returned dictionaries have the following properties:
-    - results: A defaultdict with task names as keys and dictionaries as values. Each dictionary contains
-    "metric,filter" strings as keys and corresponding metric values as values, the matching
-    "metric_stderr,filter" entries, a "samples" entry holding the task's sample_len, and an "alias" entry.
-    - samples: A defaultdict with task names as keys and lists of log samples as values.
-    - configs: A defaultdict with task names as keys and task configurations as values.
-    - versions: A defaultdict with task names as keys and task versions as values.
-    - num_fewshot: A defaultdict with task names as keys and number of few-shot samples as values.
-    - higher_is_better: A defaultdict with task names as keys and the value returned by the task's
-    higher_is_better() as values.
-    """
-    # stores the final result for each task, for each metric/filter pair.
-    results = collections.defaultdict(dict)
-    # logs info about each document evaluated.
-    samples = collections.defaultdict(list)
-    # store num-fewshot value per task
-    num_fewshot = collections.defaultdict(int)
-    # Tracks the YAML configs of all chosen task
-    configs = collections.defaultdict(dict)
-    # Tracks each task's version.
-    versions = collections.defaultdict(dict)
-    # Track `higher_is_better` for each metric
-    higher_is_better = collections.defaultdict(dict)
-    for task_output in eval_tasks:
-        # NOTE(review): this membership test assumes task_config is a dict; group placeholders built by
-        # TaskOutput.from_taskdict carry task_config=None - presumably filtered out before this call, verify.
-        # NOTE(review): TaskOutput.from_taskdict reads the alias from the config key "alias", while this
-        # looks for "task_alias" - confirm which key the configs actually carry.
-        if "task_alias" in (task_config := task_output.task_config):
-            results[task_output.task_name]["alias"] = task_config["task_alias"]
-        else:
-            results[task_output.task_name]["alias"] = task_output.task_name
-        if group_alias := task_output.group_alias:
-            # NOTE(review): the membership test uses the alias, while the entry is stored under the
-            # group name - confirm this is intended.
-            if group_alias not in results and (group_name := task_output.group_name):
-                results[group_name]["alias"] = group_alias
-        num_fewshot[task_output.task_name] = task_output.n_shot
-        configs[task_output.task_name] = task_output.task_config
-        versions[task_output.task_name] = task_output.version
-        samples[task_output.task_name] = task_output.logged_samples
-        higher_is_better[task_output.task_name] = task_output.task.higher_is_better()
-        # copy the aggregates computed by TaskOutput.calculate_aggregate_metric into the flat results dict
-        for (metric, filter_key), items in task_output.sample_metrics.items():
-            metric_key = f"{metric},{filter_key}"
-            results[task_output.task_name][metric_key] = task_output.agg_metrics[
-                metric_key
-            ]
-            results[task_output.task_name]["samples"] = task_output.sample_len
-            results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = (
-                task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
-            )
-    return results, samples, configs, versions, num_fewshot, higher_is_better
-def consolidate_group_results(
-    results,
-    versions,
-    task_dict,
-    task_root=None,
-    show_group_table=False,
-    task_aggregation_list=None,
-) -> Tuple[dict, dict, bool, Union[None,]]:
-    """
-    (Recursively) calculates groups' aggregated metrics and updates the results and versions dictionaries with this info.
-    @param results: per-task results dictionary; group entries are added to it.
-    @param versions: per-task versions dictionary; group versions are added to it.
-    @param task_dict: (nested) dictionary whose keys are ConfigurableGroup objects or names and whose
-    values are Task objects or nested dictionaries of the same form.
-    @param task_root: name of the group whose members are being processed (empty at the top level).
-    @param show_group_table: running flag, see below.
-    @param task_aggregation_list: running mapping from group name to the subtask names collected so far.
-    @return: a tuple [results, versions, show_group_table, task_aggregation_list] with formats described below:
-    - results: A defaultdict with task names (and, after this function is called, group names of
-    groups that perform aggregation) as keys, and dictionaries with "alias" and metric,filter_name pairs as keys.
-    - versions: A defaultdict with task names (and, after this function is called, group names of
-    groups that perform aggregation) as keys, and float values representing the task or group's version if a version is specified. (defaulting to None).
-    - show_group_table: a boolean which is true if there exists a group that requires printing of its aggregated scores in a group table.
-    - task_aggregation_list: a defaultdict listing the subtasks to average over to produce a given group's end metric.
-    The method then returns the updated results, versions, show_group_table, and task_aggregation_list as a tuple.
-    In the top-level invocation of this function, task_aggregation_list is ignored.
-    @raises ValueError: if a group's metric config asks for an aggregation that is neither "mean" nor a callable.
-    """
-    if task_root is None:
-        task_root = {}
-    if task_aggregation_list is None:
-        task_aggregation_list = {}
-    for group_or_task, group_or_task_info in task_dict.items():
-        # Convert to string
-        if isinstance(group_or_task, ConfigurableGroup):
-            group_config = group_or_task.config
-            group_or_task = group_or_task.group_name
-        else:
-            group_config = None
-        if isinstance(group_or_task_info, Task):
-            # leaf task: record it under the enclosing group, if there is one
-            if task_root:
-                task_aggregation_list.setdefault(task_root, []).append(
-                    group_or_task_info.task_name
-                )
-        else:
-            # nested group: process its members first so their results exist before aggregating
-            (
-                results,
-                versions,
-                show_group_table,
-                _task_aggregation_list,
-            ) = consolidate_group_results(
-                results,
-                versions,
-                group_or_task_info,
-                group_or_task,
-                show_group_table,
-                task_aggregation_list,
-            )
-            if task_root:
-                # the enclosing group also aggregates over everything collected for this subgroup
-                task_aggregation_list.setdefault(task_root, []).extend(
-                    task_aggregation_list.get(group_or_task, [])
-                )
-            if (group_config is None) or (
-                group_config["aggregate_metric_list"] is None
-            ):
-                # no aggregation requested: leave a " " marker entry, which prepare_print_tasks
-                # checks to keep this group out of the group table
-                results[group_or_task][" "] = " "
-                continue
-            if "aggregate_metric_list" in group_config:
-                agg_metric_list = group_config["aggregate_metric_list"]
-            show_group_table = show_group_table | bool(
-                group_config["aggregate_metric_list"]
-            )
-            # NOTE(review): raises KeyError if nothing was recorded for this group (e.g. an empty
-            # group) - confirm that cannot happen.
-            task_list = _task_aggregation_list[group_or_task]
-            # every "metric,filter" key reported by any subtask, excluding stderr and bookkeeping keys
-            metric_list = list(
-                {
-                    key
-                    for task in task_list
-                    for key in results[task].keys()
-                    if "_stderr" not in key and key not in ["task", "alias", "samples"]
-                }
-            )
-            for metric in metric_list:
-                # "metric,filter" -> "metric_stderr,filter"
-                stderr = "_stderr,".join(metric.split(","))
-                # gather metrics, sizes, and stderrs from subtasks
-                metrics = [
-                    results[task][metric]
-                    for task in task_list
-                    if metric in results[task]
-                ]  # TODO: copy?
-                stderrs = [
-                    results[task][stderr]
-                    for task in task_list
-                    if stderr in results[task]
-                ]
-                sizes = [
-                    results[task]["samples"]
-                    for task in task_list
-                    if metric in results[task]
-                ]
-                for metric_config in agg_metric_list:
-                    for filter_name in metric_config["filter_list"]:
-                        # only act on the config entry that names exactly this metric/filter pair
-                        if metric != ",".join([metric_config["metric"], filter_name]):
-                            continue
-                        # compute group's pooled metric and stderr
-                        if metric_config["aggregation"] == "mean":
-                            aggregate_fn = aggregate_subtask_metrics
-                        elif callable(metric_config["aggregation"]):
-                            aggregate_fn = metric_config["aggregation"]
-                        else:
-                            raise ValueError(
-                                f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{metric_config['aggregation']}' for group '{group_or_task}'"
-                            )
-                        results[group_or_task][metric] = aggregate_fn(
-                            metrics,
-                            sizes,
-                            metric_config["weight_by_size"],
-                        )
-                        # TODO: calculate groups' metrics using arbitrary agg fns
-                        if "N/A" in stderrs:
-                            results[group_or_task][stderr] = "N/A"
-                        else:
-                            # NOTE: this assumes we are using the mean to aggregate. There are warnings about this elsewhere
-                            results[group_or_task][stderr] = pooled_sample_stderr(
-                                stderrs, sizes
-                            )
-                # these two writes sit inside the per-metric loop, so they are skipped when
-                # metric_list is empty and otherwise reflect the last metric processed
-                results[group_or_task]["samples"] = sum(sizes)
-                group_metadata = group_config.get("metadata", None)
-                if group_metadata is not None:
-                    versions[group_or_task] = group_metadata.get("version", None)
-    # print(results)
-    return results, versions, show_group_table, task_aggregation_list
-@positional_deprecated
-def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
-    """
-    Search upward in the directory tree to a maximum of three layers
-    to find and return the package root (containing the 'tests' folder).
-    The resolved starting path counts as the first layer. A directory qualifies when it
-    contains `tests/test_version_stable.py`.
-    @param start_path: Path to start searching from.
-    @return: The first inspected directory (resolved) that contains the marker file.
-    @raises FileNotFoundError: If none of the inspected directories contains the marker file.
-    """
-    max_layers = 3
-    cur_path = start_path.resolve()
-    for _ in range(max_layers):
-        if (cur_path / "tests" / "test_version_stable.py").exists():
-            return cur_path
-        cur_path = cur_path.parent.resolve()
-    # The original message joined "upwards" and "of ..." without a space ("upwardsof").
-    raise FileNotFoundError(
-        f"Unable to find package root within {max_layers} layers upwards of {start_path}"
-    )
-@positional_deprecated
-def run_task_tests(task_list: List[str]):
-    """
-    Find the package root and run the tests for the given tasks.
-    Runs `tests/test_version_stable.py` through pytest, selecting tests with a `-k`
-    expression that ORs the task names together. The package root is appended to
-    `sys.path` before pytest starts.
-    @raises ValueError: If pytest returns a non-zero (truthy) exit code.
-    """
-    import pytest
-    root = find_test_root(start_path=pathlib.Path(__file__))
-    keyword_expr = " or ".join(task_list)
-    sys.path.append(str(root))
-    exit_code = pytest.main(
-        [
-            f"{root}/tests/test_version_stable.py",
-            f"--rootdir={root}",
-            "-k",
-            keyword_expr,
-        ]
-    )
-    if not exit_code:
-        return
-    raise ValueError(
-        f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {exit_code}"
-    )

lm-evaluation-harness/lm_eval/filters/__init__.py DELETED Viewed

@@ -1,25 +0,0 @@
-from functools import partial
-from typing import List
-from lm_eval.api.filter import FilterEnsemble
-from lm_eval.api.registry import get_filter
-from . import custom, extraction, selection, transformation
-def build_filter_ensemble(
-    filter_name: str, components: List[List[str]]
-) -> FilterEnsemble:
-    """
-    Create a filtering pipeline.
-    Each component is a `(registered filter name, kwargs)` pair; kwargs may be None.
-    Every pair becomes a `partial` of the registered filter with its kwargs bound, and the
-    partials are wrapped, in order, in a FilterEnsemble named `filter_name`.
-    """
-    pipeline = [
-        # look the filter up by its registry name and bind its kwargs (None means no kwargs)
-        partial(get_filter(step_name), **({} if step_kwargs is None else step_kwargs))
-        for step_name, step_kwargs in components
-    ]
-    return FilterEnsemble(name=filter_name, filters=pipeline)

lm-evaluation-harness/lm_eval/filters/custom.py DELETED Viewed

@@ -1,17 +0,0 @@
-from lm_eval.api.filter import Filter
-from lm_eval.api.registry import register_filter
-@register_filter("custom")
-class CustomFilter(Filter):
-    """
-    Filter that delegates to a user-supplied callable.
-    The callable is passed as the required keyword argument `filter_fn` and is invoked as
-    `filter_fn(resps, docs)`; whatever it returns is the filter's output.
-    """
-    def __init__(self, **kwargs) -> None:
-        # `filter_fn` is mandatory (KeyError if missing); remaining kwargs go to the base class
-        user_fn = kwargs.pop("filter_fn")
-        self.filter_fn = user_fn
-        super().__init__(**kwargs)
-    def apply(self, resps, docs):
-        user_fn = self.filter_fn
-        return user_fn(resps, docs)

lm-evaluation-harness/lm_eval/filters/decontamination.py DELETED Viewed

@@ -1,25 +0,0 @@
-from lm_eval.api.filter import Filter
-from lm_eval.api.registry import register_filter
-@register_filter("decontaminate")
-class DecontaminationFilter(Filter):
-    """
-    Placeholder for a filter that tracks contamination of evaluation documents.
-    Nothing is implemented yet: construction only initializes an empty result slot and
-    `apply` returns None.
-    """
-    name = "track_decontamination"
-    def __init__(self, path) -> None:
-        """
-        `path` is accepted but not used yet.
-        TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path").
-        should further cache result on a given (task_name, doc_id)
-        """
-        self._decontam_results = None
-    def apply(self, resps, docs) -> None:
-        """
-        Not implemented; always returns None.
-        Intended to return {"no_contamination", "only_contamination"} keys for the 2 different subsets.
-        """
-        return None

lm-evaluation-harness/lm_eval/filters/extraction.py DELETED Viewed

@@ -1,233 +0,0 @@
-import re
-import sys
-import unicodedata
-from lm_eval.api.filter import Filter
-from lm_eval.api.registry import register_filter
-@register_filter("regex")
-class RegexFilter(Filter):
-    """A filter that extracts values from text using regex pattern matching.
-    This filter applies a regex pattern to each model response and extracts matched values.
-    If no match is found, returns a fallback value. Useful for extracting structured data
-    (like numbers) from unstructured model outputs.
-    """
-    def __init__(
-        self,
-        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
-        group_select: int = 0,
-        fallback: str = "[invalid]",
-    ) -> None:
-        """
-        pass a string `regex` to run `re.compile(r"regex")` on.
-        `group_select` is the index into the `findall` result that is kept (negative indices count from the end).
-        `fallback` defines the output returned if no matches for the regex are located.
-        """
-        self.regex_pattern = regex_pattern
-        self.regex = re.compile(regex_pattern)
-        self.group_select = group_select
-        self.fallback = fallback
-    def _extract(self, resp: str) -> str:
-        """Return the selected match in `resp`, or the fallback when there is none."""
-        matches = self.regex.findall(resp)
-        try:
-            match = matches[self.group_select]
-        except IndexError:
-            # Either nothing matched, or there are fewer matches than `group_select`
-            # addresses; the latter used to escape as an IndexError.
-            return self.fallback
-        if isinstance(match, tuple):
-            # several capture groups: keep the first non-empty one
-            match = next((m for m in match if m), self.fallback)
-        return match.strip()
-    def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
-        """
-        Apply the regex to every response.
-        `resps` is a list in which each element is the list of model responses for one
-        input/target pair; each inner list is processed independently and the nesting is kept.
-        `docs` is not used.
-        """
-        return [[self._extract(resp) for resp in inst] for inst in resps]
-@register_filter("regex_pos")
-class POSFilter(Filter):
-    """
-    Filter that reduces each response to its list of part-of-speech tags.
-    A string response is scanned for `('token', 'tag')` pairs; a non-string response is
-    iterated as (token, tag) pairs directly. Responses yielding no tags become the fallback.
-    """
-    def __init__(
-        self,
-        regex_pattern: str = r"\['(.*?)'\]",
-        group_select=0,
-        fallback=None,
-    ) -> None:
-        """
-        pass a string `regex` to run `re.compile(r"regex")` on.
-        `fallback` defines the output returned if no tags are found; it defaults to ["invalid"].
-        The compiled regex and `group_select` are stored but not used by `apply`.
-        """
-        self.regex_pattern = regex_pattern
-        self.regex = re.compile(regex_pattern)
-        self.group_select = group_select
-        self.fallback = ["invalid"] if fallback is None else fallback
-    def apply(self, resps, docs):
-        """Return a lazy `map` yielding, per entry of `resps`, the tag list of each response."""
-        def tags_of(resp):
-            pairs = resp
-            if isinstance(resp, str):
-                # Extract tagged tokens list from text input using regex
-                pairs = [
-                    (token, pos)
-                    for token, pos in re.findall(r"\('([^']*)', '([^']*)'\)", resp)
-                ]
-            tags = [pos for _, pos in pairs]
-            return tags if tags else self.fallback
-        def tags_per_response(inst):
-            return [tags_of(resp) for resp in inst]
-        return map(lambda inst: tags_per_response(inst), resps)
-@register_filter("remove_whitespace")
-class WhitespaceFilter(Filter):
-    """Filters out leading whitespace from responses."""
-    def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
-        """Left-strip every response, keeping the list-of-lists nesting; `docs` is not used."""
-        return [[resp.lstrip() for resp in inst] for inst in resps]
-@register_filter("multi_choice_regex")
-class MultiChoiceRegexFilter(RegexFilter):
-    """
-    A filter used to extract a model's answer on multiple choice questions with
-    letter answers. assumes each document has a "choices" field
-    containing the list of answer choices and that the answer label symbols
-    are of the form (A), (B), (C), ... or A, B, C.
-    """
-    # Code point -> None table that deletes Unicode punctuation through str.translate.
-    # Built on first use and then reused: scanning every code point is expensive, and the
-    # table used to be rebuilt on every apply() call even when it was never needed.
-    _punct_tbl = None
-    def __init__(
-        self,
-        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
-        group_select=0,
-        fallback: str = "[invalid]",
-        ignore_case=False,
-        ignore_punctuation=False,
-        regexes_to_ignore=None,
-    ) -> None:
-        r"""
-        regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
-                        - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
-                        - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
-        group_select: Selects the (group_select)th match from the findall result.
-        ignore_case: Ignores the case during step 1 matching
-        ignore_punctuation: Remove the punctuation during step 1 matching
-        regexes_to_ignore: Remove these regexes during step 1 matching
-        """
-        super().__init__(regex_pattern, group_select, fallback)
-        self.ignore_case = ignore_case
-        self.ignore_punctuation = ignore_punctuation
-        self.regexes_to_ignore = regexes_to_ignore
-    @classmethod
-    def _punctuation_table(cls) -> dict:
-        """Return the punctuation-deleting translation table, building it on first use."""
-        if cls._punct_tbl is None:
-            cls._punct_tbl = dict.fromkeys(
-                i
-                for i in range(sys.maxunicode + 1)
-                if unicodedata.category(chr(i)).startswith("P")
-            )
-        return cls._punct_tbl
-    def _filter_ignores(self, st: str) -> str:
-        """Apply the configured normalizations (ignored regexes, case, punctuation) to `st`."""
-        if self.regexes_to_ignore is not None:
-            for s in self.regexes_to_ignore:
-                st = re.sub(s, "", st)
-        if self.ignore_case:
-            st = st.lower()
-        if self.ignore_punctuation:
-            # https://stackoverflow.com/a/266162
-            st = st.translate(self._punctuation_table())
-        return st
-    def _find_match(self, regex, resp, convert_dict=None):
-        """
-        Return the selected, stripped match of `regex` in `resp`, mapped through
-        `convert_dict` when it has an entry for it. Returns a falsy value when there is
-        no usable match, so the caller can move on to the next strategy.
-        """
-        matches = regex.findall(resp)
-        try:
-            match = matches[self.group_select]
-        except IndexError:
-            # nothing matched, or fewer matches than `group_select` addresses
-            return None
-        if isinstance(match, tuple):
-            # first non-empty capture group; "" (falsy) when every group is empty,
-            # which used to raise IndexError
-            match = next((m for m in match if m), "")
-        match = match.strip()
-        if match and convert_dict is not None and match in convert_dict:
-            match = convert_dict[match]
-        return match
-    def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
-        """
-        Extract one answer per response, trying three strategies in order: the configured
-        regex, the document's choice texts (mapped to "(A)", "(B)", ...), then a bare letter
-        following a colon. Responses matching none of them become the fallback.
-        `resps` is a list in which each element is the list of model responses for one
-        input/target pair; each is processed against the document at the same position in `docs`.
-        """
-        filtered_resps = []
-        for r, doc in zip(resps, docs):
-            fallback_regexes = []
-            choice_to_alpha = {}
-            without_paren_to_target = {}
-            for idx, c in enumerate(doc["choices"]):
-                letter = chr(ord("A") + idx)
-                m = self._filter_ignores(c.strip())
-                fallback_regexes.append(re.escape(m))
-                choice_to_alpha[m] = f"({letter})"
-                without_paren_to_target[letter] = f"({letter})"
-            fallback_regex = re.compile("|".join(fallback_regexes))
-            letters = "|".join(without_paren_to_target)
-            without_paren_fallback_regex = re.compile(rf":[\s]*({letters})")
-            filtered = []
-            for resp in r:
-                # `or` short-circuits, so later strategies only run when earlier ones fail
-                match = (
-                    self._find_match(self.regex, resp)
-                    or self._find_match(
-                        fallback_regex, self._filter_ignores(resp), choice_to_alpha
-                    )
-                    or self._find_match(
-                        without_paren_fallback_regex, resp, without_paren_to_target
-                    )
-                    or self.fallback
-                )
-                filtered.append(match)
-            filtered_resps.append(filtered)
-        return filtered_resps

lm-evaluation-harness/lm_eval/filters/selection.py DELETED Viewed

@@ -1,61 +0,0 @@
-from collections import Counter
-from lm_eval.api.filter import Filter
-from lm_eval.api.registry import register_filter
-# TODO: implement "arg_max" filter. either it should take in an arbitrary "scoring"/reward function
-# that takes an input and returns a scalar and then should select the max reward,
-# or should implement different filters for different ways of handling a reward model's inference.
-@register_filter("take_first")
-class TakeFirstFilter(Filter):
-    def __init__(self) -> None:
-        """
-        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
-        """
-    def apply(self, resps, docs):
-        """
-        Assuming each entry of `resps` is a list of model responses, we discard all but the first response.
-        """
-        return map(lambda r: r[0], resps)
-@register_filter("take_first_k")
-class TakeKFilter(Filter):
-    def __init__(self, **kwargs) -> None:
-        self.k = kwargs.pop("k")
-        super().__init__(**kwargs)
-    def apply(self, resps, docs):
-        # need resp to be subscriptable to check below
-        resps = list(resps)
-        # check we have at least k responses per doc, else we can't take the first k
-        assert len(resps[0]) >= self.k, (
-            f"Need at least {self.k} responses per doc to take first {self.k}, but got {len(resps[0])} only! Please increase TaskConfig.repeats ."
-        )
-        return map(lambda r: r[: self.k], resps)
-@register_filter("majority_vote")
-class MajorityVoteFilter(Filter):
-    def __init__(self) -> None:
-        """
-        Can define custom behavior here, if an individual instantiation of a Filter class should have state.
-        """
-    def apply(self, resps, docs):
-        """
-        Each entry of `resps` is a list of model responses.
-        We select the response that occurs most frequently in each entry of `resps`.
-        """
-        def select_majority(resp):
-            counts = Counter(resp)
-            vote = counts.most_common(1)[0][0]
-            return vote
-        return map(lambda r: [select_majority(r)], resps)

lm-evaluation-harness/lm_eval/filters/transformation.py DELETED Viewed

@@ -1,122 +0,0 @@
-import re
-from lm_eval.api.filter import Filter
-from lm_eval.api.registry import register_filter
-@register_filter("lowercase")
-class LowercaseFilter(Filter):
-    def __init__(self) -> None:
-        pass
-    def apply(self, resps, docs):
-        def filter_set(inst):
-            return [resp.lower() for resp in inst]
-        return [filter_set(resp) for resp in resps]
-@register_filter("uppercase")
-class UppercaseFilter(Filter):
-    def __init__(self) -> None:
-        pass
-    def apply(self, resps, docs):
-        def filter_set(inst):
-            return [resp.upper() for resp in inst]
-        return [filter_set(resp) for resp in resps]
-@register_filter("map")
-class MapFilter(Filter):
-    def __init__(self, mapping_dict: dict = None, default_value=None) -> None:
-        """
-        Initializes the MapFilter with a given mapping dictionary and default value.
-        Args:
-        - mapping_dict (dict): A dictionary containing the key-value mappings.
-                               Default is an empty dictionary.
-        - default_value (Any): The value to be returned when a key is not found in the mapping_dict.
-                               Default is None.
-        Example:
-        mapper = MapFilter({'A': 1, 'B': 2}, default_value=0)
-        """
-        if mapping_dict is None:
-            mapping_dict = {}
-        assert isinstance(mapping_dict, dict), (
-            "Provided mapping_dict is not a dictionary"
-        )
-        self.mapping_dict = mapping_dict
-        self.default_value = default_value
-    def apply(self, resps, docs):
-        def filter_set(inst):
-            return [self.mapping_dict.get(resp, self.default_value) for resp in inst]
-        return [filter_set(resp) for resp in resps]
-@register_filter("format_span")
-class SPANFilter(Filter):
-    def __init__(self) -> None:
-        pass
-    def apply(self, resps, docs):
-        def format_ner_text(text):
-            label_dict = {
-                "person": "PER",
-                "location": "LOC",
-                "organization": "ORG",
-                "counties": "LOC",
-                "places": "LOC",
-                "people": "PER",
-                "persons": "PER",
-                "company": "ORG",
-                "country": "LOC",
-                "continent": "LOC",
-                "time": "DATE",
-                "date": "DATE",
-                "per": "PER",
-                "loc": "LOC",
-                "org": "ORG",
-            }
-            text = text.lower()
-            for key, value in label_dict.items():
-                text = text.replace(key, value)
-            text = "$".join(i for i in text.split("$$"))
-            return text.rstrip("$$")
-        def format_named_entities(text):
-            """
-            Extract named entities from text and format them as 'label: value $ label: value'.
-            Handles grouped entities (e.g., LOC: kenya, uganda) and excludes 'none' values.
-            """
-            # Regular expression to match label: entities pattern
-            pattern = r"\b(PER|LOC|ORG|DATE):\s*([^$]+)"
-            # Normalize newline characters
-            text = text.replace("\n", "$").strip()
-            matches = re.findall(pattern, text)
-            formatted_entities = []
-            for label, values in matches:
-                # Split multiple entities separated by commas and strip whitespace
-                entities = [value.strip() for value in values.split(",")]
-                # Exclude 'none' entities
-                for entity in entities:
-                    if entity.lower() != "none":
-                        formatted_entities.append(f"{label.lower()}: {entity}")
-            # Join entities with the desired separator
-            return " $ ".join(formatted_entities)
-        def filter_set(inst):
-            return [
-                format_named_entities(format_ner_text(resp.lower())) for resp in inst
-            ]
-        return [filter_set(resp) for resp in resps]

lm-evaluation-harness/lm_eval/loggers/__init__.py DELETED Viewed

@@ -1,2 +0,0 @@
-from .evaluation_tracker import EvaluationTracker
-from .wandb_logger import WandbLogger

lm-evaluation-harness/lm_eval/loggers/evaluation_tracker.py DELETED Viewed

@@ -1,537 +0,0 @@
-import json
-import logging
-import os
-import re
-import time
-from collections import defaultdict
-from dataclasses import asdict, dataclass
-from datetime import datetime
-from pathlib import Path
-from datasets import load_dataset
-from datasets.utils.metadata import MetadataConfigs
-from huggingface_hub import (
-    DatasetCard,
-    DatasetCardData,
-    HfApi,
-    hf_hub_url,
-)
-from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
-from lm_eval.utils import (
-    get_file_datetime,
-    get_file_task_name,
-    get_results_filenames,
-    get_sample_results_filenames,
-    handle_non_serializable,
-    hash_string,
-    sanitize_list,
-    sanitize_model_name,
-    sanitize_task_name,
-)
-eval_logger = logging.getLogger(__name__)
-@dataclass(init=False)
-class GeneralConfigTracker:
-    """
-    Tracker for the evaluation parameters.
-    Attributes:
-        model_source (str): Source of the model (e.g. Hugging Face, GGUF, etc.)
-        model_name (str): Name of the model.
-        model_name_sanitized (str): Sanitized model name for directory creation.
-        start_time (float): Start time of the experiment. Logged at class init.
-        end_time (float): End time of the experiment. Logged when calling [`GeneralConfigTracker.log_end_time`]
-        total_evaluation_time_seconds (str): Inferred total evaluation time in seconds (from the start and end times).
-    """
-    model_source: str = None
-    model_name: str = None
-    model_name_sanitized: str = None
-    system_instruction: str = None
-    system_instruction_sha: str = None
-    fewshot_as_multiturn: bool = None
-    chat_template: str = None
-    chat_template_sha: str = None
-    start_time: float = None
-    end_time: float = None
-    total_evaluation_time_seconds: str = None
-    def __init__(self) -> None:
-        """Starts the evaluation timer."""
-        self.start_time = time.perf_counter()
-    @staticmethod
-    def _get_model_name(model_args: str) -> str:
-        """Extracts the model name from the model arguments."""
-        def extract_model_name(model_args: str, key: str) -> str:
-            """Extracts the model name from the model arguments using a key."""
-            args_after_key = model_args.split(key)[1]
-            return args_after_key.split(",")[0]
-        # order does matter, e.g. peft and delta are provided together with pretrained
-        prefixes = ["peft=", "delta=", "pretrained=", "model=", "path=", "engine="]
-        for prefix in prefixes:
-            if prefix in model_args:
-                return extract_model_name(model_args, prefix)
-        return ""
-    def log_experiment_args(
-        self,
-        model_source: str,
-        model_args: str,
-        system_instruction: str,
-        chat_template: str,
-        fewshot_as_multiturn: bool,
-    ) -> None:
-        """Logs the model source and name, system instruction, chat template, and few-shot-as-multiturn setting."""
-        self.model_source = model_source
-        self.model_name = GeneralConfigTracker._get_model_name(model_args)
-        self.model_name_sanitized = sanitize_model_name(self.model_name)
-        self.system_instruction = system_instruction
-        self.system_instruction_sha = (
-            hash_string(system_instruction) if system_instruction else None
-        )
-        self.chat_template = chat_template
-        self.chat_template_sha = hash_string(chat_template) if chat_template else None
-        self.fewshot_as_multiturn = fewshot_as_multiturn
-    def log_end_time(self) -> None:
-        """Logs the end time of the evaluation and calculates the total evaluation time."""
-        self.end_time = time.perf_counter()
-        self.total_evaluation_time_seconds = str(self.end_time - self.start_time)
-class EvaluationTracker:
-    """
-    Keeps track and saves relevant information of the evaluation process.
-    Compiles the data from trackers and writes it to files, which can be published to the Hugging Face hub if requested.
-    """
-    def __init__(
-        self,
-        output_path: str = None,
-        hub_results_org: str = "",
-        hub_repo_name: str = "",
-        details_repo_name: str = "",
-        results_repo_name: str = "",
-        push_results_to_hub: bool = False,
-        push_samples_to_hub: bool = False,
-        public_repo: bool = False,
-        token: str = "",
-        leaderboard_url: str = "",
-        point_of_contact: str = "",
-        gated: bool = False,
-    ) -> None:
-        """
-        Creates all the necessary loggers for evaluation tracking.
-        Args:
-            output_path (str): Path to save the results. If not provided, the results won't be saved.
-            hub_results_org (str): The Hugging Face organization to push the results to. If not provided, the results will be pushed to the owner of the Hugging Face token.
-            hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
-            details_repo_name (str): The name of the Hugging Face repository to push the details to. If not provided, the details will be pushed to `lm-eval-results`.
-            results_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, it defaults to `details_repo_name`, so the results are pushed to the same repository as the details.
-            push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
-            push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
-            public_repo (bool): Whether to push the results to a public or private repository.
-            token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
-            leaderboard_url (str): URL to the leaderboard on the Hugging Face hub on the dataset card.
-            point_of_contact (str): Contact information on the Hugging Face hub dataset card.
-            gated (bool): Whether to gate the repository.
-        """
-        self.general_config_tracker = GeneralConfigTracker()
-        self.output_path = output_path
-        self.push_results_to_hub = push_results_to_hub
-        self.push_samples_to_hub = push_samples_to_hub
-        self.public_repo = public_repo
-        self.leaderboard_url = leaderboard_url
-        self.point_of_contact = point_of_contact
-        self.api = HfApi(token=token) if token else None
-        self.gated_repo = gated
-        if not self.api and (push_results_to_hub or push_samples_to_hub):
-            raise ValueError(
-                "Hugging Face token is not defined, but 'push_results_to_hub' or 'push_samples_to_hub' is set to True. "
-                "Please provide a valid Hugging Face token by setting the HF_TOKEN environment variable."
-            )
-        if (
-            self.api
-            and hub_results_org == ""
-            and (push_results_to_hub or push_samples_to_hub)
-        ):
-            hub_results_org = self.api.whoami()["name"]
-            eval_logger.warning(
-                f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'."
-            )
-        if hub_repo_name == "":
-            details_repo_name = (
-                details_repo_name if details_repo_name != "" else "lm-eval-results"
-            )
-            results_repo_name = (
-                results_repo_name if results_repo_name != "" else details_repo_name
-            )
-        else:
-            details_repo_name = hub_repo_name
-            results_repo_name = hub_repo_name
-            eval_logger.warning(
-                "hub_repo_name was specified. Both details and results will be pushed to the same repository. Using hub_repo_name is no longer recommended, details_repo_name and results_repo_name should be used instead."
-            )
-        self.details_repo = f"{hub_results_org}/{details_repo_name}"
-        self.details_repo_private = f"{hub_results_org}/{details_repo_name}-private"
-        self.results_repo = f"{hub_results_org}/{results_repo_name}"
-        self.results_repo_private = f"{hub_results_org}/{results_repo_name}-private"
-    def save_results_aggregated(
-        self,
-        results: dict,
-        samples: dict,
-    ) -> None:
-        """
-        Saves the aggregated results and samples to the output path and pushes them to the Hugging Face hub if requested.
-        Args:
-            results (dict): The aggregated results to save.
-            samples (dict): The samples results to save.
-        """
-        self.general_config_tracker.log_end_time()
-        if self.output_path:
-            try:
-                eval_logger.info("Saving results aggregated")
-                # calculate cumulative hash for each task - only if samples are provided
-                task_hashes = {}
-                if samples:
-                    for task_name, task_samples in samples.items():
-                        sample_hashes = [
-                            s["doc_hash"] + s["prompt_hash"] + s["target_hash"]
-                            for s in task_samples
-                        ]
-                        task_hashes[task_name] = hash_string("".join(sample_hashes))
-                # update initial results dict
-                results.update({"task_hashes": task_hashes})
-                results.update(asdict(self.general_config_tracker))
-                dumped = json.dumps(
-                    results,
-                    indent=2,
-                    default=handle_non_serializable,
-                    ensure_ascii=False,
-                )
-                path = Path(self.output_path if self.output_path else Path.cwd())
-                self.date_id = datetime.now().isoformat().replace(":", "-")
-                if path.suffix == ".json":
-                    path.parent.mkdir(parents=True, exist_ok=True)
-                    file_results_aggregated = path.with_name(
-                        f"{path.stem}_{self.date_id}.json"
-                    )
-                else:
-                    path = path.joinpath(
-                        self.general_config_tracker.model_name_sanitized
-                    )
-                    path.mkdir(parents=True, exist_ok=True)
-                    file_results_aggregated = path.joinpath(
-                        f"results_{self.date_id}.json"
-                    )
-                file_results_aggregated.open("w", encoding="utf-8").write(dumped)
-                if self.api and self.push_results_to_hub:
-                    repo_id = (
-                        self.results_repo
-                        if self.public_repo
-                        else self.results_repo_private
-                    )
-                    self.api.create_repo(
-                        repo_id=repo_id,
-                        repo_type="dataset",
-                        private=not self.public_repo,
-                        exist_ok=True,
-                    )
-                    self.api.upload_file(
-                        repo_id=repo_id,
-                        path_or_fileobj=str(file_results_aggregated),
-                        path_in_repo=os.path.join(
-                            self.general_config_tracker.model_name,
-                            file_results_aggregated.name,
-                        ),
-                        repo_type="dataset",
-                        commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
-                    )
-                    eval_logger.info(
-                        "Successfully pushed aggregated results to the Hugging Face Hub. "
-                        f"You can find them at: {repo_id}"
-                    )
-            except Exception as e:
-                eval_logger.warning("Could not save results aggregated")
-                eval_logger.info(repr(e))
-        else:
-            eval_logger.info(
-                "Output path not provided, skipping saving results aggregated"
-            )
-    def save_results_samples(
-        self,
-        task_name: str,
-        samples: dict,
-    ) -> None:
-        """
-        Saves the samples results to the output path and pushes them to the Hugging Face hub if requested.
-        Args:
-            task_name (str): The task name to save the samples for.
-            samples (dict): The samples results to save.
-        """
-        if self.output_path:
-            try:
-                eval_logger.info(f"Saving per-sample results for: {task_name}")
-                path = Path(self.output_path if self.output_path else Path.cwd())
-                if path.suffix == ".json":
-                    path = path.parent
-                else:
-                    path = path.joinpath(
-                        self.general_config_tracker.model_name_sanitized
-                    )
-                path.mkdir(parents=True, exist_ok=True)
-                file_results_samples = path.joinpath(
-                    f"samples_{task_name}_{self.date_id}.jsonl"
-                )
-                for sample in samples:
-                    # we first need to sanitize arguments and resps
-                    # otherwise we won't be able to load the dataset
-                    # using the datasets library
-                    arguments = {}
-                    for i, arg in enumerate(sample["arguments"]):
-                        arguments[f"gen_args_{i}"] = {}
-                        for j, tmp in enumerate(arg):
-                            arguments[f"gen_args_{i}"][f"arg_{j}"] = tmp
-                    sample["resps"] = sanitize_list(sample["resps"])
-                    sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
-                    sample["arguments"] = arguments
-                    sample["target"] = str(sample["target"])
-                    sample_dump = (
-                        json.dumps(
-                            sample,
-                            default=handle_non_serializable,
-                            ensure_ascii=False,
-                        )
-                        + "\n"
-                    )
-                    with open(file_results_samples, "a", encoding="utf-8") as f:
-                        f.write(sample_dump)
-                if self.api and self.push_samples_to_hub:
-                    repo_id = (
-                        self.details_repo
-                        if self.public_repo
-                        else self.details_repo_private
-                    )
-                    self.api.create_repo(
-                        repo_id=repo_id,
-                        repo_type="dataset",
-                        private=not self.public_repo,
-                        exist_ok=True,
-                    )
-                    try:
-                        if self.gated_repo:
-                            headers = build_hf_headers()
-                            r = get_session().put(
-                                url=f"https://huggingface.co/api/datasets/{repo_id}/settings",
-                                headers=headers,
-                                json={"gated": "auto"},
-                            )
-                            hf_raise_for_status(r)
-                    except Exception as e:
-                        eval_logger.warning("Could not gate the repository")
-                        eval_logger.info(repr(e))
-                    self.api.upload_folder(
-                        repo_id=repo_id,
-                        folder_path=str(path),
-                        path_in_repo=self.general_config_tracker.model_name_sanitized,
-                        repo_type="dataset",
-                        commit_message=f"Adding samples results for {task_name} to {self.general_config_tracker.model_name}",
-                    )
-                    eval_logger.info(
-                        f"Successfully pushed sample results for task: {task_name} to the Hugging Face Hub. "
-                        f"You can find them at: {repo_id}"
-                    )
-            except Exception as e:
-                eval_logger.warning("Could not save sample results")
-                eval_logger.info(repr(e))
-        else:
-            eval_logger.info("Output path not provided, skipping saving sample results")
-    def recreate_metadata_card(self) -> None:
-        """
-        Creates a metadata card for the evaluation results dataset and pushes it to the Hugging Face hub.
-        """
-        eval_logger.info("Recreating metadata card")
-        repo_id = self.details_repo if self.public_repo else self.details_repo_private
-        files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
-        results_files = get_results_filenames(files_in_repo)
-        sample_files = get_sample_results_filenames(files_in_repo)
-        # Build a dictionary to store the latest evaluation datetime for:
-        # - Each tested model and its aggregated results
-        # - Each task and sample results, if existing
-        # i.e. {
-        #     "org__model_name__gsm8k": "2021-09-01T12:00:00",
-        #     "org__model_name__ifeval": "2021-09-01T12:00:00",
-        #     "org__model_name__results": "2021-09-01T12:00:00"
-        # }
-        latest_task_results_datetime = defaultdict(lambda: datetime.min.isoformat())
-        for file_path in sample_files:
-            file_path = Path(file_path)
-            filename = file_path.name
-            model_name = file_path.parent
-            task_name = get_file_task_name(filename)
-            results_datetime = get_file_datetime(filename)
-            task_name_sanitized = sanitize_task_name(task_name)
-            # Results and sample results for the same model and task will have the same datetime
-            samples_key = f"{model_name}__{task_name_sanitized}"
-            results_key = f"{model_name}__results"
-            latest_datetime = max(
-                latest_task_results_datetime[samples_key],
-                results_datetime,
-            )
-            latest_task_results_datetime[samples_key] = latest_datetime
-            latest_task_results_datetime[results_key] = max(
-                latest_task_results_datetime[results_key],
-                latest_datetime,
-            )
-        # Create metadata card
-        card_metadata = MetadataConfigs()
-        # Add the latest aggregated results to the metadata card for easy access
-        for file_path in results_files:
-            file_path = Path(file_path)
-            results_filename = file_path.name
-            model_name = file_path.parent
-            eval_date = get_file_datetime(results_filename)
-            eval_date_sanitized = re.sub(r"[^\w\.]", "_", eval_date)
-            results_filename = Path("**") / Path(results_filename).name
-            config_name = f"{model_name}__results"
-            sanitized_last_eval_date_results = re.sub(
-                r"[^\w\.]", "_", latest_task_results_datetime[config_name]
-            )
-            if eval_date_sanitized == sanitized_last_eval_date_results:
-                # Ensure that all results files are listed in the metadata card
-                current_results = card_metadata.get(config_name, {"data_files": []})
-                current_results["data_files"].append(
-                    {"split": eval_date_sanitized, "path": [str(results_filename)]}
-                )
-                card_metadata[config_name] = current_results
-                # If the results file is the newest, update the "latest" field in the metadata card
-                card_metadata[config_name]["data_files"].append(
-                    {"split": "latest", "path": [str(results_filename)]}
-                )
-        # Add the tasks details configs
-        for file_path in sample_files:
-            file_path = Path(file_path)
-            filename = file_path.name
-            model_name = file_path.parent
-            task_name = get_file_task_name(filename)
-            eval_date = get_file_datetime(filename)
-            task_name_sanitized = sanitize_task_name(task_name)
-            eval_date_sanitized = re.sub(r"[^\w\.]", "_", eval_date)
-            results_filename = Path("**") / Path(filename).name
-            config_name = f"{model_name}__{task_name_sanitized}"
-            sanitized_last_eval_date_results = re.sub(
-                r"[^\w\.]", "_", latest_task_results_datetime[config_name]
-            )
-            if eval_date_sanitized == sanitized_last_eval_date_results:
-                # Ensure that all sample results files are listed in the metadata card
-                current_details_for_task = card_metadata.get(
-                    config_name, {"data_files": []}
-                )
-                current_details_for_task["data_files"].append(
-                    {"split": eval_date_sanitized, "path": [str(results_filename)]}
-                )
-                card_metadata[config_name] = current_details_for_task
-                # If the samples results file is the newest, update the "latest" field in the metadata card
-                card_metadata[config_name]["data_files"].append(
-                    {"split": "latest", "path": [str(results_filename)]}
-                )
-        # Get latest results and extract info to update metadata card examples
-        latest_datetime = max(latest_task_results_datetime.values())
-        latest_model_name = max(
-            latest_task_results_datetime, key=lambda k: latest_task_results_datetime[k]
-        )
-        last_results_file = [
-            f for f in results_files if latest_datetime.replace(":", "-") in f
-        ][0]
-        last_results_file_path = hf_hub_url(
-            repo_id=repo_id, filename=last_results_file, repo_type="dataset"
-        )
-        latest_results_file = load_dataset(
-            "json", data_files=last_results_file_path, split="train"
-        )
-        results_dict = latest_results_file["results"][0]
-        new_dictionary = {"all": results_dict}
-        new_dictionary.update(results_dict)
-        results_string = json.dumps(new_dictionary, indent=4)
-        dataset_summary = (
-            "Dataset automatically created during the evaluation run of model "
-        )
-        if self.general_config_tracker.model_source == "hf":
-            dataset_summary += f"[{self.general_config_tracker.model_name}](https://huggingface.co/{self.general_config_tracker.model_name})\n"
-        else:
-            dataset_summary += f"{self.general_config_tracker.model_name}\n"
-        dataset_summary += (
-            f"The dataset is composed of {len(card_metadata) - 1} configuration(s), each one corresponding to one of the evaluated task.\n\n"
-            f"The dataset has been created from {len(results_files)} run(s). Each run can be found as a specific split in each "
-            'configuration, the split being named using the timestamp of the run.The "train" split is always pointing to the latest results.\n\n'
-            'An additional configuration "results" store all the aggregated results of the run.\n\n'
-            "To load the details from a run, you can for instance do the following:\n"
-        )
-        if self.general_config_tracker.model_source == "hf":
-            dataset_summary += (
-                "```python\nfrom datasets import load_dataset\n"
-                f'data = load_dataset(\n\t"{repo_id}",\n\tname="{latest_model_name}",\n\tsplit="latest"\n)\n```\n\n'
-            )
-        dataset_summary += (
-            "## Latest results\n\n"
-            f"These are the [latest results from run {latest_datetime}]({last_results_file_path.replace('/resolve/', '/blob/')}) "
-            "(note that there might be results for other tasks in the repos if successive evals didn't cover the same tasks. "
-            'You find each in the results and the "latest" split for each eval):\n\n'
-            f"```python\n{results_string}\n```"
-        )
-        card_data = DatasetCardData(
-            dataset_summary=dataset_summary,
-            repo_url=f"https://huggingface.co/{self.general_config_tracker.model_name}",
-            pretty_name=f"Evaluation run of {self.general_config_tracker.model_name}",
-            leaderboard_url=self.leaderboard_url,
-            point_of_contact=self.point_of_contact,
-        )
-        card_metadata.to_dataset_card_data(card_data)
-        card = DatasetCard.from_template(
-            card_data,
-            pretty_name=card_data.pretty_name,
-        )
-        card.push_to_hub(repo_id, repo_type="dataset")

lm-evaluation-harness/lm_eval/loggers/utils.py DELETED Viewed

@@ -1,149 +0,0 @@
-import logging
-import os
-import re
-import subprocess
-from importlib.metadata import version
-from pathlib import Path
-from typing import Any, Dict, Optional, Tuple, Union
-import numpy as np
-from torch.utils.collect_env import get_pretty_env_info
-from transformers import __version__ as trans_version
-logger = logging.getLogger(__name__)
def remove_none_pattern(input_string: str) -> Tuple[str, bool]:
    """Strip a trailing ',none' marker from a string.

    Args:
        input_string (str): The string to inspect, e.g. ``"acc,none"``.

    Returns:
        Tuple[str, bool]: The string with a trailing ',none' removed, and a flag
            that is True only when something was actually removed.
    """
    # subn reports how many substitutions were made, so no before/after
    # comparison is needed; the end-anchored pattern can match at most once.
    stripped, hits = re.subn(r",none$", "", input_string)
    return stripped, hits > 0
-def _handle_non_serializable(o: Any) -> Union[int, str, list]:
-    """Handle non-serializable objects by converting them to serializable types.
-    Args:
-        o (Any): The object to be handled.
-    Returns:
-        Union[int, str, list]: The converted object. If the object is of type np.int64 or np.int32,
-            it will be converted to int. If the object is of type set, it will be converted
-            to a list. Otherwise, it will be converted to str.
-    """
-    if isinstance(o, np.int64) or isinstance(o, np.int32):
-        return int(o)
-    elif isinstance(o, set):
-        return list(o)
-    else:
-        return str(o)
def get_commit_from_path(repo_path: Union[Path, str]) -> Optional[str]:
    """Read the checked-out commit hash of a Git repository from its files.

    No ``git`` executable is needed; only files under the repository's git
    directory are read.

    Args:
        repo_path (Union[Path, str]): Directory expected to contain ``.git``.

    Returns:
        Optional[str]: The commit hash, or None when ``repo_path`` is not a Git
            checkout, the ref cannot be resolved, or reading fails (the failure
            is logged at debug level).
    """
    try:
        git_folder = Path(repo_path, ".git")
        if git_folder.is_file():
            # ".git" can be a file whose first line ends with the path of the
            # real git directory (e.g. when this checkout is a submodule).
            git_folder = Path(
                git_folder.parent,
                git_folder.read_text(encoding="utf-8").split("\n")[0].split(" ")[-1],
            )
        head_file = Path(git_folder, "HEAD")
        if not head_file.exists():
            return None
        head_line = head_file.read_text(encoding="utf-8").split("\n")[0].strip()
        if not head_line.startswith("ref:"):
            # Detached HEAD: the file holds the commit hash itself, not a ref.
            return head_line or None
        head_name = head_line[len("ref:"):].strip()
        head_ref = Path(git_folder, head_name)
        if head_ref.exists():
            return head_ref.read_text(encoding="utf-8").replace("\n", "")
        # The branch may have no loose ref file; look it up in packed-refs,
        # whose data lines have the form "<hash> <ref name>".
        packed_refs = Path(git_folder, "packed-refs")
        if packed_refs.exists():
            for line in packed_refs.read_text(encoding="utf-8").splitlines():
                sha, _, ref_name = line.partition(" ")
                if ref_name.strip() == head_name:
                    return sha
        return None
    except Exception as err:
        logger.debug(
            f"Failed to retrieve a Git commit hash from path: {str(repo_path)}. Error: {err}"
        )
        return None
def get_git_commit_hash():
    """
    Gets the git commit hash of your current repo (if it exists).

    Runs ``git describe --always`` first; if that command fails or ``git`` is
    missing, falls back to ``get_commit_from_path`` on the working directory.
    Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
    """
    describe_cmd = ["git", "describe", "--always"]
    try:
        raw_output = subprocess.check_output(describe_cmd)
        return raw_output.strip().decode()
    except (subprocess.CalledProcessError, FileNotFoundError):
        # FileNotFoundError occurs when git not installed on system
        return get_commit_from_path(os.getcwd())  # git hash of repo if exists
def add_env_info(storage: Dict[str, Any]):
    """Add environment and version metadata to ``storage`` in place.

    Each probe is best-effort: if collecting the environment report or the
    lm_eval version raises, the error text is stored in place of the value.

    Args:
        storage (Dict[str, Any]): Dictionary updated with the keys
            ``pretty_env_info``, ``transformers_version``, ``lm_eval_version``
            and ``upper_git_hash``.
    """
    try:
        env_report = get_pretty_env_info()
    except Exception as err:
        env_report = str(err)

    try:
        harness_version = version("lm_eval")
    except Exception as err:
        harness_version = str(err)

    # git hash of upper repo if exists
    parent_commit = get_commit_from_path(Path(os.getcwd(), ".."))

    storage.update(
        {
            "pretty_env_info": env_report,
            "transformers_version": trans_version,
            "lm_eval_version": harness_version,
            "upper_git_hash": parent_commit,  # in case this repo is submodule
        }
    )
def add_tokenizer_info(storage: Dict[str, Any], lm):
    """Add tokenizer metadata of ``lm`` to ``storage`` in place.

    Nothing is written when ``lm`` has no truthy ``tokenizer`` attribute, or
    when reading any of the tokenizer fields raises; both cases are only
    logged at debug level.

    Args:
        storage (Dict[str, Any]): Dictionary to update.
        lm: Model object; its ``tokenizer``, ``eot_token_id`` and ``max_length``
            attributes are read.
    """
    if not getattr(lm, "tokenizer", False):
        # seems gguf and textsynth do not have tokenizer
        logger.debug(
            "LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results."
        )
        return

    try:
        tok = lm.tokenizer
        # Build the whole record first so a failure leaves storage untouched.
        collected = {
            "tokenizer_pad_token": [tok.pad_token, str(tok.pad_token_id)],
            "tokenizer_eos_token": [tok.eos_token, str(tok.eos_token_id)],
            "tokenizer_bos_token": [tok.bos_token, str(tok.bos_token_id)],
            "eot_token_id": getattr(lm, "eot_token_id", None),
            "max_length": getattr(lm, "max_length", None),
        }
        storage.update(collected)
    except Exception as err:
        logger.debug(
            f"Logging detailed tokenizer info failed with {err}, skipping..."
        )

lm-evaluation-harness/lm_eval/loggers/wandb_logger.py DELETED Viewed

@@ -1,358 +0,0 @@
-import copy
-import json
-import logging
-from typing import Any, Dict, List, Literal, Tuple
-import numpy as np
-import pandas as pd
-from packaging.version import Version
-from lm_eval.loggers.utils import _handle_non_serializable, remove_none_pattern
-logger = logging.getLogger(__name__)
def get_wandb_printer() -> Literal["Printer"]:
    """Returns a wandb printer instance for pretty stdout."""
    # Deferred import: wandb is only touched once a printer is requested.
    from wandb.sdk.lib.printer import new_printer

    return new_printer()
class WandbLogger:
    """Report lm-eval results and per-sample outputs to a Weights & Biases run.

    Parse and log the results returned from evaluator.simple_evaluate() with:
        wandb_logger.post_init(results)
        wandb_logger.log_eval_result()
        wandb_logger.log_eval_samples(results["samples"])
    """

    def __init__(self, init_args=None, config_args=None) -> None:
        """Attaches to wandb logger if already initialized. Otherwise, passes init_args to wandb.init() and config_args to wandb.config.update()

        Args:
            init_args Optional[Dict]: Arguments for init configuration. A "step"
                entry, if present, is taken out and reused for every log call.
            config_args Optional[Dict]: Arguments for config

        Raises:
            Exception: Whatever ``import wandb`` raised (normally ImportError);
                a warning with install instructions is logged first.
        """
        install_hint = (
            "To use the wandb reporting functionality please install wandb>=0.13.6.\n"
            "To install the latest version of wandb run `pip install wandb --upgrade`\n"
        )
        try:
            import wandb
        except Exception as e:
            logger.warning(f"{install_hint}{e}")
            # Without wandb nothing below can work; surface the real error
            # instead of failing later on an unbound name.
            raise

        # Explicit check (not `assert`) so behavior is the same under `python -O`.
        try:
            outdated = Version(wandb.__version__) < Version("0.13.6")
        except Exception as e:
            logger.warning(f"{install_hint}{e}")
        else:
            if outdated:
                logger.warning(f"{install_hint}Found wandb=={wandb.__version__}")

        # Copy so that popping "step" does not modify the caller's dict.
        self.wandb_args: Dict[str, Any] = dict(init_args or {})
        self.wandb_config_args: Dict[str, Any] = config_args or {}

        # pop the step key from the args to save for all logging calls
        self.step = self.wandb_args.pop("step", None)

        # initialize a W&B run
        if wandb.run is None:
            self.run = wandb.init(**self.wandb_args)
            if self.wandb_config_args:
                self.run.config.update(self.wandb_config_args)
        else:
            self.run = wandb.run

        self.printer = get_wandb_printer()

    def post_init(self, results: Dict[str, Any]) -> None:
        """Store a private copy of ``results`` and derive task/group names.

        Args:
            results (Dict[str, Any]): Evaluation output; the "results", "groups"
                and "configs" entries are read here.
        """
        self.results: Dict[str, Any] = copy.deepcopy(results)
        self.task_names: List[str] = list(results.get("results", {}).keys())
        self.group_names: List[str] = list(results.get("groups", {}).keys())
        # Set here as well so log_eval_samples() works even when
        # log_eval_result() has not been called first.
        self.task_configs = self.results.get("configs", {})

    def _get_config(self) -> Dict[str, Any]:
        """Get configuration parameters."""
        self.task_configs = self.results.get("configs", {})
        cli_configs = self.results.get("config", {})
        configs = {
            "task_configs": self.task_configs,
            "cli_configs": cli_configs,
        }
        return configs

    def _sanitize_results_dict(self) -> Tuple[Dict[str, str], Dict[str, Any]]:
        """Sanitize the results dictionary.

        Returns:
            Tuple[Dict[str, str], Dict[str, Any]]: Two flat dicts keyed by
                "<task>/<metric>": string-valued entries (for the run summary)
                and all remaining entries (for regular logging). A trailing
                ",none" is removed from metric names.
        """
        _results = copy.deepcopy(self.results.get("results", dict()))

        # Remove None from the metric string name
        for task_name in self.task_names:
            task_result = _results.get(task_name, dict())
            # Snapshot the items: the dict is modified while renaming.
            for metric_name, metric_value in list(task_result.items()):
                _metric_name, removed = remove_none_pattern(metric_name)
                if removed:
                    task_result[_metric_name] = metric_value
                    task_result.pop(metric_name)

        # Flatten to "<task>/<metric>" keys, routing string values to the
        # summary. The key is built once and never split again, so task names
        # that themselves contain "/" are handled.
        wandb_summary: Dict[str, str] = {}
        flat_results: Dict[str, Any] = {}
        for task_name in self.task_names:
            for metric_name, metric_value in _results.get(task_name, dict()).items():
                flat_key = f"{task_name}/{metric_name}"
                if isinstance(metric_value, str):
                    wandb_summary[flat_key] = metric_value
                else:
                    flat_results[flat_key] = metric_value

        return wandb_summary, flat_results

    def _log_results_as_table(self) -> None:
        """Generate and log evaluation results as a table to W&B."""
        columns = [
            "Version",
            "Filter",
            "num_fewshot",
            "Metric",
            "Value",
            "Stderr",
        ]

        def make_table(columns: List[str], key: str = "results"):
            """Build a wandb.Table with one row per (entry, metric, filter)."""
            import wandb

            table = wandb.Table(columns=columns)
            results = copy.deepcopy(self.results)
            # Tolerate result dicts that lack these optional sections.
            versions = results.get("versions") or {}
            n_shots = results.get("n-shot") or {}

            for k, dic in (results.get(key) or {}).items():
                # Groups get their own table; skip them in the task table.
                if k in self.group_names and not key == "groups":
                    continue
                version = versions.get(k)
                if version == "N/A":
                    version = None
                n = n_shots.get(k)

                for mf, v in dic.items():
                    # Keys look like "<metric>,<filter>".
                    m, _, f = mf.partition(",")
                    if m.endswith("_stderr"):
                        continue
                    if m == "alias":
                        continue

                    stderr_key = m + "_stderr" + "," + f
                    if stderr_key in dic:
                        se = dic[stderr_key]
                        if se != "N/A":
                            se = "%.4f" % se
                        table.add_data(*[k, version, f, n, m, str(v), str(se)])
                    else:
                        table.add_data(*[k, version, f, n, m, str(v), ""])

            return table

        # log the complete eval result to W&B Table
        table = make_table(["Tasks"] + columns, "results")
        self.run.log({"evaluation/eval_results": table}, step=self.step)

        if "groups" in self.results.keys():
            table = make_table(["Groups"] + columns, "groups")
            self.run.log({"evaluation/group_eval_results": table}, step=self.step)

    def _log_results_as_artifact(self) -> None:
        """Log results as JSON artifact to W&B."""
        import wandb

        dumped = json.dumps(
            self.results, indent=2, default=_handle_non_serializable, ensure_ascii=False
        )
        artifact = wandb.Artifact("results", type="eval_results")
        with artifact.new_file("results.json", mode="w", encoding="utf-8") as f:
            f.write(dumped)
        self.run.log_artifact(artifact)

    def log_eval_result(self) -> None:
        """Log evaluation results to W&B."""
        # Log configs to wandb
        configs = self._get_config()
        self.run.config.update(configs, allow_val_change=self.step is not None)

        wandb_summary, self.wandb_results = self._sanitize_results_dict()
        # update wandb.run.summary with items that were removed
        self.run.summary.update(wandb_summary)
        # Log the evaluation metrics to wandb
        self.run.log(self.wandb_results, step=self.step)
        # Log the evaluation metrics as W&B Table
        self._log_results_as_table()
        # Log the results dict as json to W&B Artifacts
        self._log_results_as_artifact()

    def _generate_dataset(
        self, data: List[Dict[str, Any]], config: Dict[str, Any]
    ) -> pd.DataFrame:
        """Generate a dataset from evaluation data.

        Args:
            data (List[Dict[str, Any]]): The data to generate a dataset for.
            config (Dict[str, Any]): The configuration of the task; its
                "metric_list" and "output_type" entries are read.

        Returns:
            pd.DataFrame: A dataframe that is ready to be uploaded to W&B.
        """

        def describe_loglikelihood(logprob, is_greedy) -> str:
            """Render one (log-probability, is-greedy) pair as readable text."""
            return (
                f"log probability of continuation is {logprob} "
                + "\n\n"
                + "continuation will {} generated with greedy sampling".format(
                    "be" if is_greedy else "not be"
                )
            )

        ids = [x["doc_id"] for x in data]
        labels = [x["target"] for x in data]
        # Placeholders, used when the output type is not one handled below.
        instance = [""] * len(ids)
        resps = [""] * len(ids)
        filtered_resps = [""] * len(ids)
        model_outputs = {}

        metrics = {}
        for metric_entry in config["metric_list"]:
            metric = metric_entry.get("metric")
            if metric in ["word_perplexity", "byte_perplexity", "bits_per_byte"]:
                # These metrics are stored as pairs; split them into two columns.
                metrics[f"{metric}_loglikelihood"] = [x[metric][0] for x in data]
                if metric in ["byte_perplexity", "bits_per_byte"]:
                    metrics[f"{metric}_bytes"] = [x[metric][1] for x in data]
                else:
                    metrics[f"{metric}_words"] = [x[metric][1] for x in data]
            else:
                metrics[metric] = [x[metric] for x in data]

        output_type = config["output_type"]
        if output_type == "loglikelihood":
            instance = [x["arguments"][0][0] for x in data]
            labels = [x["arguments"][0][1] for x in data]
            resps = [
                describe_loglikelihood(x["resps"][0][0][0], x["resps"][0][0][1])
                for x in data
            ]
            filtered_resps = [
                describe_loglikelihood(
                    x["filtered_resps"][0][0], x["filtered_resps"][0][1]
                )
                for x in data
            ]
        elif output_type == "multiple_choice":
            instance = [x["arguments"][0][0] for x in data]
            choices = [
                "\n".join([f"{idx}. {y[1]}" for idx, y in enumerate(x["arguments"])])
                for x in data
            ]
            # The prediction is the index of the highest-scoring choice.
            resps = [np.argmax([n[0][0] for n in x["resps"]]) for x in data]
            filtered_resps = [
                np.argmax([n[0] for n in x["filtered_resps"]]) for x in data
            ]
        elif output_type in ("loglikelihood_rolling", "generate_until"):
            instance = [x["arguments"][0][0] for x in data]
            resps = [x["resps"][0][0] for x in data]
            filtered_resps = [x["filtered_resps"][0] for x in data]

        model_outputs["raw_predictions"] = resps
        model_outputs["filtered_predictions"] = filtered_resps

        df_data = {
            "id": ids,
            "data": instance,
        }
        if output_type == "multiple_choice":
            df_data["choices"] = choices

        tmp_data = {
            "input_len": [len(x) for x in instance],
            "labels": labels,
            "output_type": output_type,
        }
        df_data.update(tmp_data)
        df_data.update(model_outputs)
        df_data.update(metrics)

        return pd.DataFrame(df_data)

    def _log_samples_as_artifact(
        self, data: List[Dict[str, Any]], task_name: str
    ) -> None:
        """Log the samples of one task as a JSON artifact named after the task."""
        import wandb

        # log the samples as an artifact
        dumped = json.dumps(
            data,
            indent=2,
            default=_handle_non_serializable,
            ensure_ascii=False,
        )
        artifact = wandb.Artifact(f"{task_name}", type="samples_by_task")
        with artifact.new_file(
            f"{task_name}_eval_samples.json", mode="w", encoding="utf-8"
        ) as f:
            f.write(dumped)
        self.run.log_artifact(artifact)
        # artifact.wait()

    def log_eval_samples(self, samples: Dict[str, List[Dict[str, Any]]]) -> None:
        """Log evaluation samples to W&B.

        Tasks without a "group" entry in their config get one table each;
        tasks sharing a group are combined into one table per group. Every
        task's samples are also logged as a JSON artifact.

        Args:
            samples (Dict[str, List[Dict[str, Any]]]): Evaluation samples for each task.
        """
        task_names: List[str] = [
            x for x in self.task_names if x not in self.group_names
        ]

        ungrouped_tasks = []
        tasks_by_groups: Dict[str, List[str]] = {}
        for task_name in task_names:
            group_names = self.task_configs[task_name].get("group", None)
            if not group_names:
                ungrouped_tasks.append(task_name)
                continue
            if isinstance(group_names, str):
                group_names = [group_names]
            for group_name in group_names:
                tasks_by_groups.setdefault(group_name, []).append(task_name)

        for task_name in ungrouped_tasks:
            eval_preds = samples[task_name]

            # log the samples as a W&B Table
            df = self._generate_dataset(eval_preds, self.task_configs.get(task_name))
            self.run.log({f"{task_name}_eval_results": df}, step=self.step)

            # log the samples as a json file as W&B Artifact
            self._log_samples_as_artifact(eval_preds, task_name)

        for group, grouped_tasks in tasks_by_groups.items():
            frames = []
            for task_name in grouped_tasks:
                eval_preds = samples[task_name]
                df = self._generate_dataset(
                    eval_preds, self.task_configs.get(task_name)
                )
                df["group"] = group
                df["task"] = task_name
                frames.append(df)

                # log the samples as a json file as W&B Artifact
                self._log_samples_as_artifact(eval_preds, task_name)

            # Concatenate once per group rather than growing a frame per task.
            grouped_df = (
                pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
            )
            self.run.log({f"{group}_eval_results": grouped_df}, step=self.step)

lm-evaluation-harness/lm_eval/models/__init__.py DELETED Viewed

@@ -1,36 +0,0 @@
-from . import (
-    anthropic_llms,
-    api_models,
-    dummy,
-    gguf,
-    hf_audiolm,
-    hf_steered,
-    hf_vlms,
-    huggingface,
-    ibm_watsonx_ai,
-    mamba_lm,
-    nemo_lm,
-    neuralmagic,
-    neuron_optimum,
-    openai_completions,
-    optimum_ipex,
-    optimum_lm,
-    sglang_causallms,
-    sglang_generate_API,
-    textsynth,
-    vllm_causallms,
-    vllm_vlms,
-)
-# TODO: implement __all__
-try:
-    # enable hf hub transfer if available
-    import hf_transfer  # type: ignore # noqa
-    import huggingface_hub.constants  # type: ignore
-    huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
-except ImportError:
-    pass

lm-evaluation-harness/lm_eval/models/anthropic_llms.py DELETED Viewed

@@ -1,367 +0,0 @@
-import logging
-import os
-from functools import cached_property
-from typing import Any, Dict, List, Tuple, Union
-from tqdm import tqdm
-from lm_eval.api.model import LM
-from lm_eval.api.registry import register_model
-from lm_eval.models.openai_completions import LocalCompletionsAPI
-from lm_eval.models.utils import handle_stop_sequences, retry_on_specific_exceptions
-eval_logger = logging.getLogger(__name__)
def anthropic_completion(
    client,  #: anthropic.Anthropic,
    model: str,
    prompt: str,
    max_tokens_to_sample: int,
    temperature: float,
    stop: List[str],
    **kwargs: Any,
) -> str:
    """Wrapper function around the Anthropic completion API client that retries
    in case of RateLimitError.

    params:
        client: anthropic.Anthropic
            Anthropic API client
        model: str
            Anthropic model e.g. 'claude-instant-v1', 'claude-2'
        prompt: str
            Prompt to feed to the model
        max_tokens_to_sample: int
            Maximum number of tokens to sample from the model
        temperature: float
            Sampling temperature
        stop: List[str]
            List of stop sequences. None is treated as an empty list and a
            single string as a one-element list.
        kwargs: Any
            Additional model_args to pass to the API client

    returns:
        The ``completion`` attribute of the API response.

    raises:
        ModuleNotFoundError: if the `anthropic` package is not installed.
    """
    try:
        import anthropic
    except ModuleNotFoundError as exception:
        raise type(exception)(
            "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. "
            "please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
        ) from exception

    # Normalize `stop` so the list concatenation below cannot fail on None
    # and a bare string is not split into characters.
    if stop is None:
        extra_stops = []
    elif isinstance(stop, str):
        extra_stops = [stop]
    else:
        extra_stops = list(stop)

    def _exception_callback(e: Exception, sleep_time: float) -> None:
        eval_logger.warning(
            f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds"
        )

    @retry_on_specific_exceptions(
        on_exceptions=[anthropic.RateLimitError],
        max_retries=None,  # retry forever, consider changing
        on_exception_callback=_exception_callback,
    )
    def completion():
        response = client.completions.create(
            prompt=f"{anthropic.HUMAN_PROMPT} {prompt}{anthropic.AI_PROMPT}",
            model=model,
            # NOTE: Claude really likes to do CoT, and overly aggressive stop sequences
            #       (e.g. gsm8k's ":") may truncate a lot of the input.
            stop_sequences=[anthropic.HUMAN_PROMPT] + extra_stops,
            max_tokens_to_sample=max_tokens_to_sample,
            temperature=temperature,
            **kwargs,
        )
        return response.completion

    return completion()
def anthropic_chat(
    client,  #: anthropic.Anthropic,
    model: str,
    prompt: str,
    max_tokens: int,
    temperature: float,
    stop: List[str],
    **kwargs: Any,
) -> str:
    """Send a single user message through the Anthropic messages API, retrying
    on rate-limit, connection and API status errors.

    params:
        client: anthropic.Anthropic
            Anthropic API client
        model: str
            Anthropic model e.g. 'claude-3-opus-20240229', 'claude-3-sonnet-20240229'
        prompt: str
            Prompt to feed to the model
        max_tokens: int
            Maximum number of tokens to sample from the model
        temperature: float
            Sampling temperature
        stop: List[str]
            List of stop sequences (accepted but not forwarded to the API here)
        kwargs: Any
            Additional model_args to pass to the API client

    returns:
        The text of the first content item of the API response.
    """
    try:
        import anthropic
    except ModuleNotFoundError as exception:
        raise type(exception)(
            "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. "
            "please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
        )

    retryable_errors = [
        anthropic.RateLimitError,
        anthropic.APIConnectionError,
        anthropic.APIStatusError,
    ]

    def _log_retry(e: Exception, sleep_time: float) -> None:
        eval_logger.warning(
            f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds"
        )

    @retry_on_specific_exceptions(
        on_exceptions=retryable_errors,
        max_retries=None,  # retry forever, consider changing
        on_exception_callback=_log_retry,
    )
    def _send_message():
        user_turn = {"role": "user", "content": f"{prompt}"}
        reply = client.messages.create(
            model=model,
            max_tokens=max_tokens,
            temperature=temperature,
            messages=[user_turn],
            **kwargs,
        )
        return reply.content[0].text

    return _send_message()
@register_model("anthropic-completions")
class AnthropicLM(LM):
    """Generation-only wrapper around the Anthropic text completions API.

    Only ``generate_until`` is implemented; every loglikelihood-style entry
    point raises NotImplementedError.
    """

    REQ_CHUNK_SIZE = 20  # TODO: not used

    def __init__(
        self,
        batch_size: int = 1,
        model: str = "claude-2.0",
        max_tokens_to_sample: int = 256,
        temperature: float = 0,  # defaults to 1
        **kwargs,  # top_p, top_k, etc.
    ) -> None:
        """Anthropic API wrapper.

        :param batch_size: int
            Accepted for interface compatibility; not used by this class.
        :param model: str
            Anthropic model e.g. 'claude-instant-v1', 'claude-2'
        :param max_tokens_to_sample: int
            Maximum number of tokens to sample from the model; used as the
            default generation length in `generate_until`.
        :param temperature: float
            Sampling temperature
        :param kwargs: Any
            Additional model_args to pass to the API client
        """
        super().__init__()

        try:
            import anthropic
        except ModuleNotFoundError as exception:
            raise type(exception)(
                "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. "
                "please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
            )

        self.model = model
        # defaults to os.environ.get("ANTHROPIC_API_KEY")
        self.client = anthropic.Anthropic()
        self.temperature = temperature
        self.max_tokens_to_sample = max_tokens_to_sample
        self.tokenizer = self.client.get_tokenizer()
        self.kwargs = kwargs

    @property
    def eot_token_id(self):
        # Not sure but anthropic.HUMAN_PROMPT ?
        raise NotImplementedError("No idea about anthropic tokenization.")

    @property
    def max_length(self) -> int:
        return 2048

    @property
    def max_gen_toks(self) -> int:
        return self.max_tokens_to_sample

    @property
    def batch_size(self):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError("No support for logits.")

    @property
    def device(self):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError("No support for logits.")

    def tok_encode(self, string: str) -> List[int]:
        return self.tokenizer.encode(string).ids

    def tok_decode(self, tokens: List[int]) -> str:
        return self.tokenizer.decode(tokens)

    def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
        raise NotImplementedError("No support for logits.")

    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
        """Generate one completion per request.

        Each request's ``args`` is read as ``(prompt, generation_kwargs)``;
        the "until", "max_gen_toks" and "temperature" entries of the kwargs
        are honored.

        Returns:
            List[str]: Completions in request order. NOTE: on an API
                connection or status error the loop stops early, so the list
                can be shorter than ``requests``.
        """
        try:
            import anthropic
        except ModuleNotFoundError as exception:
            raise type(exception)(
                "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. "
                "please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
            )

        if not requests:
            return []

        _requests: List[Tuple[str, dict]] = [req.args for req in requests]

        res = []
        for request in tqdm(_requests, disable=disable_tqdm):
            try:
                inp = request[0]
                request_args = request[1]
                # generation_kwargs
                # A missing "until" becomes an empty list and a bare string
                # a one-element list, so it can be appended to the stop list.
                until = request_args.get("until") or []
                if isinstance(until, str):
                    until = [until]
                # Default to the configured generation budget
                # (max_tokens_to_sample), not the context length.
                max_gen_toks = request_args.get("max_gen_toks", self.max_gen_toks)
                temperature = request_args.get("temperature", self.temperature)
                response = anthropic_completion(
                    client=self.client,
                    model=self.model,
                    prompt=inp,
                    max_tokens_to_sample=max_gen_toks,
                    temperature=temperature,  # TODO: implement non-greedy sampling for Anthropic
                    stop=until,  # type: ignore
                    **self.kwargs,
                )
                res.append(response)

                self.cache_hook.add_partial("generate_until", request, response)
            except anthropic.APIConnectionError as e:  # type: ignore # noqa: F821
                eval_logger.critical(f"Server unreachable: {e.__cause__}")
                break
            except anthropic.APIStatusError as e:  # type: ignore # noqa: F821
                eval_logger.critical(f"API error {e.status_code}: {e.message}")
                break

        return res

    def _model_call(self, inps):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    def _model_generate(self, context, max_length, eos_token_id):
        # Isn't used because we override generate_until
        raise NotImplementedError()

    def loglikelihood(self, requests, disable_tqdm: bool = False):
        raise NotImplementedError("No support for logits.")

    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
        raise NotImplementedError("No support for logits.")
@register_model("anthropic-chat", "anthropic-chat-completions")
class AnthropicChat(LocalCompletionsAPI):
    """Client for the Anthropic messages HTTP endpoint (one request at a time)."""

    def __init__(
        self,
        base_url="https://api.anthropic.com/v1/messages",
        tokenizer_backend=None,
        **kwargs,
    ):
        """Configure the endpoint and force a batch size of 1.

        Args:
            base_url: URL of the messages endpoint.
            tokenizer_backend: Forwarded to the parent class.
            **kwargs: Forwarded to the parent class.
        """
        super().__init__(
            base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
        )
        eval_logger.warning(
            "Chat completions does not support batching. Defaulting to batch size 1."
        )
        self._batch_size = 1
        self.anthropic_version = "2023-06-01"
        eval_logger.warning(
            f"Using Anthropic Version: {self.anthropic_version}. Confirm the current version here: https://docs.anthropic.com/en/api/versioning"
        )

    @cached_property
    def api_key(self):
        """Override this property to return the API key for the API request."""
        key = os.environ.get("ANTHROPIC_API_KEY", None)
        if key is None:
            raise ValueError(
                "API key not found. Please set the ANTHROPIC_API_KEY environment variable."
            )
        return key

    @cached_property
    def header(self):
        """HTTP headers carrying the API key and the pinned API version."""
        return {
            "x-api-key": f"{self.api_key}",
            "anthropic-version": self.anthropic_version,
        }

    def _create_payload(
        self,
        messages: List[Dict],
        generate=True,
        gen_kwargs: dict = None,
        eos="\n\nHuman:",
        **kwargs,
    ) -> dict:
        """Build the JSON body for one messages request.

        Args:
            messages: Chat messages as dicts with "role" and "content". A
                leading "system" message is moved to the payload's "system"
                field.
            generate: Unused here.
            gen_kwargs: Generation options. "do_sample" is dropped;
                "max_gen_toks", "temperature" and "until" are translated; the
                rest is passed through. May be None. The caller's dict is not
                modified.
            eos: Passed to ``handle_stop_sequences`` together with "until".

        Returns:
            dict: The request payload.
        """
        # Work on a copy: the pops below must not alter the caller's dict,
        # and None (the declared default) must be accepted.
        gen_kwargs = dict(gen_kwargs or {})

        system = None
        if messages and messages[0].get("role") == "system":
            system = messages[0].get("content")
            # Take the system turn out of the message list even when its
            # content is empty.
            messages = messages[1:]

        gen_kwargs.pop("do_sample", False)
        max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
        temperature = gen_kwargs.pop("temperature", 0)
        stop = handle_stop_sequences(gen_kwargs.pop("until", ["\n\nHuman:"]), eos=eos)
        if not isinstance(stop, list):
            stop = [stop]

        out = {
            "messages": messages,
            "model": self.model,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stop_sequences": stop,
            **gen_kwargs,
        }
        if system:
            out["system"] = system
        return out

    def parse_generations(
        self, outputs: Union[Dict, List[Dict]], **kwargs
    ) -> List[str]:
        """Collect the "text" of every item under each response's "content"."""
        if not isinstance(outputs, list):
            outputs = [outputs]
        return [item["text"] for out in outputs for item in out["content"]]

    def tok_encode(
        self,
        string: str,
        left_truncate_len=None,
        add_special_tokens=None,
        **kwargs,
    ) -> List[str]:
        """Return the string unchanged, wrapped in a list (no tokenization)."""
        return [string]

    def loglikelihood(self, requests, **kwargs):
        raise NotImplementedError(
            "Anthropic Chat Completions API does not support the return of loglikelihood"
        )

lm-evaluation-harness/lm_eval/models/api_models.py DELETED Viewed

@@ -1,799 +0,0 @@
-import abc
-import asyncio
-import copy
-import itertools
-import json
-import logging
-from functools import cached_property
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Awaitable,
-    Callable,
-    Dict,
-    Iterable,
-    List,
-    Literal,
-    NamedTuple,
-    Optional,
-    Tuple,
-    Union,
-)
-try:
-    import requests
-    from aiohttp import ClientSession, ClientTimeout, TCPConnector
-    from tenacity import RetryError, retry, stop_after_attempt, wait_exponential
-    from tqdm import tqdm
-    from tqdm.asyncio import tqdm_asyncio
-except ModuleNotFoundError:
-    pass
-import base64
-from importlib.util import find_spec
-from io import BytesIO
-from lm_eval import utils
-from lm_eval.api.instance import Instance
-from lm_eval.api.model import TemplateLM
-from lm_eval.models.utils import Collator, chunks, configure_pad_token
-if TYPE_CHECKING:
-    from PIL import Image
-eval_logger = logging.getLogger(__name__)
-LogLikelihoodInputs = Tuple[Tuple[str, str], List[int], List[int]]
-# utility class to keep track of json encoded chats
-class JsonChatStr(NamedTuple):
-    prompt: str
-    def encode(self, encoding):
-        return self.prompt.encode(encoding)
-def create_image_prompt(
-    imgs: list["Image.Image"], chat: dict, fmt: str = "PNG"
-) -> dict:
-    """
-    Parameters
-    ----------
-    img : list[PIL.Image.Image]
-        The list of images to encode to base64
-    chat : dict
-    fmt : str, optional
-        Any format Pillow understands (e.g. "PNG", "JPEG").
-        Defaults to "PNG".
-    Returns
-    -------
-    dict
-    """
-    images = []
-    for img in imgs:
-        buf = BytesIO()
-        img.save(buf, format=fmt)
-        img_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
-        img_dict = {
-            "type": "image_url",
-            "image_url": {"url": f"data:image/png;base64,{img_b64}", "detail": "auto"},
-        }
-        images.append(img_dict)
-    # chat is in format of list[dict["role": "user"/"system", "content": str, "type": "text"],...]
-    # with images, we need "content" to be a list of dicts with "type" and "text"/"image_url"
-    # currently we do not support few-shots so only one user message
-    # text content also has <image> placeholders, which apparently is not necessary for API class (confirm)
-    if isinstance(chat[-1]["content"], list):
-        chat[-1]["content"] = images + chat[-1]["content"]
-    else:
-        text_content = {"type": "text", "text": chat[-1]["content"]}
-        chat[-1]["content"] = images + [text_content]
-    chat[-1].pop("type")
-    return chat
-class TemplateAPI(TemplateLM):
-    MULTIMODAL = True
-    def __init__(
-        self,
-        model: str = None,
-        pretrained: str = None,  # `model` takes precedence over `pretrained` when passed.
-        base_url: str = None,
-        tokenizer: Optional[str] = None,
-        # Loglikelihood tasks require a tokenizer to calculate context lengths,
-        # however the requests can be sent as a string if the API doesn't support token inputs.
-        # use tokenized_requests=False
-        tokenizer_backend: Optional[
-            Literal["tiktoken", "huggingface", "None", "none"]
-        ] = "huggingface",
-        truncate: bool = False,
-        # number of concurrent requests. More useful if not batching
-        num_concurrent: int = 1,
-        max_retries: int = 3,
-        max_gen_toks: int = 256,
-        batch_size: Union[str, int] = 1,
-        seed: int = 1234,
-        max_length: Optional[int] = 2048,
-        add_bos_token: bool = False,
-        custom_prefix_token_id: int = None,
-        # send the requests as tokens or strings
-        tokenized_requests: bool = True,
-        trust_remote_code: bool = False,
-        revision: Optional[str] = "main",
-        use_fast_tokenizer: bool = True,
-        verify_certificate: bool = True,
-        eos_string: str = None,
-        # timeout in seconds
-        timeout: int = 300,
-        max_images: int = 1,
-        **kwargs,
-    ) -> None:
-        super().__init__()
-        missing_packages = [
-            pkg
-            for pkg in ["aiohttp", "tqdm", "tenacity", "requests"]
-            if find_spec(pkg) is None
-        ]
-        if missing_packages:
-            raise ModuleNotFoundError(
-                f"Attempted to use an API model, but the required packages {missing_packages} are not installed. "
-                'Please install these via `pip install lm-eval[api]` or `pip install -e ."[api]"`'
-            )
-        self.model = model or pretrained
-        self.base_url = base_url
-        self.tokenizer = tokenizer
-        if not isinstance(batch_size, int) and "auto" in batch_size:
-            eval_logger.warning(
-                "Automatic batch size is not supported for API models. Defaulting to batch size 1."
-            )
-        elif int(batch_size) > 1:
-            eval_logger.warning(
-                "Batch size > 1 detected. Ensure your API supports batched requests with varying total sequence lengths."
-            )
-        self._batch_size = int(batch_size) if batch_size != "auto" else 1
-        self._truncate = truncate
-        self._max_gen_toks = int(max_gen_toks)
-        self._seed = int(seed)
-        # max_length - 1 as we always have 1 token for generation
-        eval_logger.info(f"Using max length {max_length} - 1")
-        self.max_length = max_length - 1
-        if int(num_concurrent) <= 1:
-            eval_logger.info(
-                "Concurrent requests are disabled. To enable concurrent requests, set `num_concurrent` > 1."
-            )
-        self._concurrent = int(num_concurrent)
-        self.tokenizer_backend = (
-            None if tokenizer_backend in ("None", "none") else tokenizer_backend
-        )
-        self.add_bos_token = add_bos_token
-        self.custom_prefix_token_id = custom_prefix_token_id
-        self.tokenized_requests = tokenized_requests
-        self.max_retries = int(max_retries)
-        self.verify_certificate = verify_certificate
-        self._eos_string = eos_string
-        self.timeout = int(timeout)
-        self.max_images = int(max_images)
-        eval_logger.info(f"Using tokenizer {self.tokenizer_backend}")
-        if self.tokenizer_backend is None:
-            self.tokenizer = None
-            self.tokenized_requests = False
-        else:
-            if self.tokenizer is None:
-                if self.tokenizer_backend == "huggingface":
-                    import transformers
-                    self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-                        self.tokenizer if self.tokenizer else self.model,
-                        trust_remote_code=trust_remote_code,
-                        revision=revision,
-                        use_fast=use_fast_tokenizer,
-                    )
-                    # Not used as the API will handle padding but to mirror the behavior of the HFLM
-                    self.tokenizer = configure_pad_token(self.tokenizer)
-                elif self.tokenizer_backend == "tiktoken":
-                    try:
-                        import tiktoken
-                        self.tokenizer = tiktoken.encoding_for_model(self.model)
-                    except ModuleNotFoundError as e:
-                        raise ModuleNotFoundError(
-                            "Attempted to use 'openai' LM type, but the package `tiktoken` is not installed. "
-                            "Please install it via `pip install lm-eval[api]` or `pip install -e .[api]`."
-                        ) from e
-                    if "openai" not in self.base_url:
-                        eval_logger.warning(
-                            f"Passed `base_url={self.base_url}` but using (OpenAI) Tiktoken tokenizer backend. "
-                            "Pass `tokenizer_backend=huggingface` and provide the HF tokenizer name if your model does not use Tiktoken."
-                        )
-            else:
-                import transformers
-                assert isinstance(tokenizer, str), "tokenizer must be a string"
-                self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-                    tokenizer,
-                    trust_remote_code=trust_remote_code,
-                    revision=revision,
-                    use_fast=use_fast_tokenizer,
-                )
-    @abc.abstractmethod
-    def _create_payload(
-        self,
-        messages: Union[List[List[int]], List[dict], List[str], str],
-        *,
-        generate: bool = True,
-        gen_kwargs: Optional[dict] = None,
-        seed: int = 1234,
-        eos: str = None,
-        **kwargs,
-    ) -> dict:
-        """This method is responsible for creating the json payload that will be sent to the API."""
-        raise NotImplementedError
-    def create_message(
-        self,
-        messages: Union[List[List[int]], List[str], List[JsonChatStr]],
-        generate=False,
-    ) -> Union[List[List[int]], List[dict], List[str], str]:
-        """Helper method to transform the prompt into the expected API input format. messages consist of batched requests"""
-        if isinstance(messages[0], JsonChatStr):
-            # for chat completions we need to decode the json string to list[dict,...]
-            assert self._batch_size == 1, (
-                "non-tokenized chat requests are only supported with batch_size=1"
-            )
-            # list[dict["role":..., "content":...],...]
-            return json.loads(messages[0].prompt)
-        if not self.tokenized_requests:
-            # if messages are tokenized:
-            if isinstance(messages[0][0], int):
-                # assuming decoding is lossless. However, this is only for loglikelihood requests
-                # as we need to compute the context length. For generations, we don't need to tokenize.
-                messages = self.decode_batch(messages)
-            if self._batch_size <= 1:
-                # if batch is 1 return str
-                return messages[0]
-            else:
-                # list[str,...]
-                return messages
-        # list[list[int], ...]
-        return messages
-    @staticmethod
-    @abc.abstractmethod
-    def parse_logprobs(
-        outputs: Union[Any, List[Any]],
-        tokens: List[List[int]] = None,
-        ctxlen: List[int] = None,
-        **kwargs,
-    ) -> List[Tuple[float, bool]]:
-        """Method used to parse the logprobs from the (batched) API response. This method should return a list of tuples"""
-        raise NotImplementedError
-    @staticmethod
-    @abc.abstractmethod
-    def parse_generations(outputs: Union[Any, List[Any]], **kwargs) -> List[str]:
-        """Method used to parse the generations from the (batched) API response. This method should return a list of str"""
-        raise NotImplementedError
-    @cached_property
-    def api_key(self) -> str:
-        """Override this property to return the API key for the API request."""
-        return ""
-    @cached_property
-    def header(self) -> dict:
-        """Override this property to return the headers for the API request."""
-        return {"Authorization": f"Bearer {self.api_key}"}
-    @property
-    def tokenizer_name(self) -> str:
-        """Must be defined for LM subclasses which implement Chat Templating.
-        Should return the name of the tokenizer or chat template used.
-        Used only to properly fingerprint caches when requests are being cached with `--cache_requests`, otherwise not used.
-        """
-        return ""
-    def apply_chat_template(
-        self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
-    ) -> Union[str, JsonChatStr]:
-        """Applies a chat template to a list of chat history between user and model."""
-        if self.tokenizer_backend == "huggingface" and self.tokenized_requests:
-            return self.tokenizer.apply_chat_template(
-                chat_history,
-                tokenize=False,
-                add_generation_prompt=add_generation_prompt,
-                continue_final_message=not add_generation_prompt,
-            )
-        else:
-            # bit of a hack. We'll load back before sending to the API
-            return JsonChatStr(
-                json.dumps(
-                    [{**item, "type": "text"} for item in chat_history],
-                    ensure_ascii=False,
-                )
-            )
-    @cached_property
-    def eot_token_id(self) -> Optional[int]:
-        if self.tokenizer is None:
-            return None
-        else:
-            if self.tokenizer_backend == "huggingface":
-                return self.tokenizer.eos_token_id
-            elif self.tokenizer_backend == "tiktoken":
-                return self.tokenizer.eot_token
-    @cached_property
-    def eos_string(self) -> Optional[str]:
-        if self._eos_string:
-            return self._eos_string
-        elif self.tokenizer is not None:
-            if self.tokenizer_backend == "huggingface":
-                return self.tokenizer.eos_token
-            elif self.tokenizer_backend == "tiktoken":
-                return self.tokenizer.decode([self.tokenizer.eot_token])
-        else:
-            eval_logger.warning(
-                "Cannot determine EOS string to pass to stop sequence. Manually set by passing `eos_string` to model_args."
-            )
-            return None
-    @cached_property
-    def prefix_token_id(self) -> Optional[int]:
-        if self.tokenizer is None:
-            return None
-        else:
-            if self.custom_prefix_token_id is not None:
-                return self.custom_prefix_token_id
-            if self.tokenizer_backend == "huggingface":
-                if self.tokenizer.bos_token_id is not None:
-                    return self.tokenizer.bos_token_id
-                return self.tokenizer.eos_token_id
-            else:
-                return self.tokenizer.eot_token
-    def tok_encode(
-        self,
-        string: str,
-        left_truncate_len: int = None,
-        add_special_tokens: bool = False,
-        truncation: bool = False,
-        **kwargs,
-    ) -> Union[List[List[int]], List[int], List[str]]:
-        if self.tokenizer_backend is None:
-            return [string]
-        elif self.tokenizer_backend == "huggingface":
-            # by default for CausalLM - false or self.add_bos_token is set
-            if not add_special_tokens:
-                add_special_tokens = False or self.add_bos_token
-            encoding: Union[List[List[int]], List[int]] = self.tokenizer(
-                string,
-                add_special_tokens=add_special_tokens,
-                truncation=truncation,
-                return_attention_mask=False,
-            ).input_ids
-            # left-truncate the encoded context to be at most `left_truncate_len` tokens long
-            if left_truncate_len:
-                if not isinstance(string, str):
-                    encoding = [enc[-left_truncate_len:] for enc in encoding]
-                else:
-                    encoding = encoding[-left_truncate_len:]
-            return encoding
-        else:
-            try:
-                encoding = self.tokenizer.encode(string)
-            except Exception:
-                encoding = self.tokenizer.encode_batch(string)
-            return encoding
-    def decode_batch(self, tokens: List[List[int]]) -> List[str]:
-        if self.tokenizer_backend == "huggingface":
-            return self.tokenizer.batch_decode(tokens)
-        elif self.tokenizer_backend == "tiktoken":
-            return self.tokenizer.decode_batch(tokens)
-    def model_call(
-        self,
-        messages: Union[List[List[int]], List[str], List[JsonChatStr]],
-        *,
-        generate: bool = True,
-        gen_kwargs: Optional[Dict] = None,
-        **kwargs,
-    ) -> Optional[dict]:
-        # !!! Copy: shared dict for each request, need new object !!!
-        gen_kwargs = copy.deepcopy(gen_kwargs)
-        try:
-            response = requests.post(
-                self.base_url,
-                json=self._create_payload(
-                    self.create_message(messages),
-                    generate=generate,
-                    gen_kwargs=gen_kwargs,
-                    seed=self._seed,
-                    eos=self.eos_string,
-                    **kwargs,
-                ),
-                headers=self.header,
-                verify=self.verify_certificate,
-            )
-            if not response.ok:
-                eval_logger.warning(
-                    f"API request failed with error message: {response.text}. Retrying..."
-                )
-            response.raise_for_status()
-            return response.json()
-        except RetryError:
-            eval_logger.error(
-                "API request failed after multiple retries. Please check the API status."
-            )
-            return None
-    async def amodel_call(
-        self,
-        session: ClientSession,
-        messages: Union[List[List[int]], List[str], List[JsonChatStr]],
-        *,
-        generate: bool = True,
-        cache_keys: list = None,
-        ctxlens: Optional[List[int]] = None,
-        gen_kwargs: Optional[Dict] = None,
-        **kwargs,
-    ) -> Union[List[str], List[Tuple[float, bool]], None]:
-        # !!! Copy: shared dict for each request, need new object !!!
-        gen_kwargs = copy.deepcopy(gen_kwargs)
-        payload = self._create_payload(
-            self.create_message(messages),
-            generate=generate,
-            gen_kwargs=gen_kwargs,
-            seed=self._seed,
-            **kwargs,
-        )
-        cache_method = "generate_until" if generate else "loglikelihood"
-        try:
-            async with session.post(
-                self.base_url,
-                json=payload,
-                headers=self.header,
-            ) as response:
-                if not response.ok:
-                    error_text = await response.text()
-                    eval_logger.warning(
-                        f"API request failed with error message: {error_text}. Retrying..."
-                    )
-                # raising exception will retry the request
-                response.raise_for_status()
-                outputs = await response.json()
-            answers = (
-                self.parse_generations(
-                    outputs=outputs,
-                )
-                if generate
-                else self.parse_logprobs(
-                    outputs=outputs,
-                    tokens=messages,
-                    ctxlens=ctxlens,
-                )
-            )
-            if cache_keys:
-                for res, cache in zip(answers, cache_keys):
-                    self.cache_hook.add_partial(cache_method, cache, res)
-            return answers
-        # If the retries also fail
-        except RetryError:
-            eval_logger.error(
-                "API request failed after multiple retries. Please check the API status."
-            )
-            return None
-    def batch_loglikelihood_requests(
-        self, chunks: Iterable[List[LogLikelihoodInputs]]
-    ) -> Tuple[List[List[int]], List[int], List[Tuple[str, str]]]:
-        inputs = []
-        ctxlens = []
-        cache_keys = []
-        for chunk in chunks:
-            for cache_key, context_enc, continuation_enc in chunk:
-                # max_length - 1 as we always have 1 token for generation
-                inp = (context_enc + continuation_enc)[-self.max_length :]
-                if len(inp) < len(context_enc + continuation_enc):
-                    eval_logger.warning(
-                        f"Context length ({len(context_enc)}) + continuation length ({len(continuation_enc)}) > max_length ({self.max_length}). Left truncating context."
-                    )
-                ctxlen = len(context_enc) - max(
-                    0, len(context_enc) + len(continuation_enc) - self.max_length
-                )
-                inputs.append(inp)
-                ctxlens.append(ctxlen)
-                cache_keys.append(cache_key)
-        return inputs, ctxlens, cache_keys
-    async def get_batched_requests(
-        self,
-        requests: list,
-        cache_keys: list,
-        *,
-        generate: bool = True,
-        ctxlens: List[int] = None,
-        **kwargs,
-    ) -> Union[List[List[str]], List[List[Tuple[float, bool]]]]:
-        ctxlens = ctxlens if ctxlens else [None] * len(requests)
-        conn = TCPConnector(limit=self._concurrent, ssl=self.verify_certificate)
-        async with ClientSession(
-            connector=conn, timeout=ClientTimeout(total=self.timeout)
-        ) as session:
-            retry_: Callable[..., Awaitable[Any]] = retry(
-                stop=stop_after_attempt(self.max_retries),
-                wait=wait_exponential(multiplier=0.5, min=1, max=10),
-                reraise=True,
-            )(self.amodel_call)
-            # Create tasks for each batch of request
-            tasks = [
-                asyncio.create_task(
-                    retry_(
-                        session=session,
-                        messages=message,
-                        cache_keys=cache_key,
-                        generate=generate,
-                        ctxlens=ctxlen,
-                        **kwargs,
-                    )
-                )
-                for message, cache_key, ctxlen in zip(
-                    chunks(requests, n=self._batch_size),
-                    chunks(cache_keys, n=self._batch_size),
-                    chunks(ctxlens, n=self._batch_size),
-                )
-            ]
-            return await tqdm_asyncio.gather(*tasks, desc="Requesting API")
-    def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
-        assert self.tokenizer is not None, (
-            "Tokenizer is required for loglikelihood tasks to compute context lengths."
-        )
-        res = []
-        def _collate(req: LogLikelihoodInputs):
-            """Defines the key for the sorted method"""
-            # the negative sign on len(toks) sorts descending - this has a few advantages:
-            # - time estimates will always be over not underestimates, which is more useful for planning
-            # - to know the size of a batch when going through the list, you know the first one is always the batch
-            #   padded context length. this is useful to simplify the batching logic and more importantly to make
-            #   automatic adaptive batches much much easier to implement
-            # - any OOMs will happen right away rather than near the end
-            toks = req[1] + req[2]
-            return -len(toks), tuple(toks)
-        re_ord = Collator(
-            requests,
-            sort_fn=_collate,
-            group_by=None,
-        )
-        # if concurrent then we'll batch in the async context
-        chunked = re_ord.get_batched(n=self._batch_size if self._concurrent <= 1 else 0)
-        if self._concurrent <= 1:
-            pbar = tqdm(desc="Requesting API", total=len(requests))
-            for chunk in chunked:
-                inputs, ctxlens, cache_keys = self.batch_loglikelihood_requests([chunk])
-                outputs = retry(
-                    stop=stop_after_attempt(self.max_retries),
-                    wait=wait_exponential(multiplier=0.5, min=1, max=10),
-                    reraise=True,
-                )(self.model_call)(messages=inputs, generate=False)
-                if isinstance(outputs, dict):
-                    outputs = [outputs]
-                for answer_, cache_key in zip(
-                    self.parse_logprobs(
-                        outputs=outputs, tokens=inputs, ctxlens=ctxlens
-                    ),
-                    cache_keys,
-                ):
-                    if answer_ is not None:
-                        res.append(answer_)
-                        # cache requests that aren't from a loglikelihood_rolling request
-                        if cache_key is not None:
-                            self.cache_hook.add_partial(
-                                "loglikelihood", cache_key, answer_
-                            )
-                        pbar.update(1)
-        else:
-            inputs, ctxlens, cache_keys = self.batch_loglikelihood_requests(chunked)
-            res = itertools.chain.from_iterable(
-                asyncio.run(
-                    self.get_batched_requests(
-                        inputs, cache_keys, generate=False, ctxlens=ctxlens
-                    )
-                )
-            )
-        return re_ord.get_original(res)
-    def generate_until(
-        self, requests: List[Instance], disable_tqdm: bool = False
-    ) -> List[str]:
-        res = []
-        def _collate_gen(_requests):
-            # sort by the length of the non-tokenized contexts
-            return -len(_requests[0])
-        # Let the API deal with tokenization
-        if len(requests[0].args) > 2:
-            assert self.tokenizer is None, (
-                "tokenizer is not supported for multimodal requests yet!"
-            )
-            eval_logger.info(
-                f"Using max_images {self.max_images}. Set in the model args."
-            )
-            requests, all_gen_kwargs, auxiliary_args = zip(
-                *(req.args for req in requests)
-            )
-            requests = tuple(
-                JsonChatStr(
-                    json.dumps(
-                        create_image_prompt(
-                            y["visual"][: self.max_images], json.loads(x.prompt)
-                        )
-                    )
-                )
-                for x, y in zip(requests, auxiliary_args)
-            )
-        else:
-            requests, all_gen_kwargs = zip(*(req.args for req in requests))
-        if self.tokenized_requests:
-            encodings_list = self.tok_encode(
-                requests, add_special_tokens=self.add_bos_token
-            )
-        else:
-            encodings_list = [None] * len(requests)
-        requests = [
-            (a, b, c) for a, b, c in zip(requests, all_gen_kwargs, encodings_list)
-        ]
-        re_ord = Collator(
-            requests,
-            sort_fn=_collate_gen,
-            group_by="gen_kwargs",
-        )
-        chunked = re_ord.get_batched(
-            n=self._batch_size if self._concurrent <= 1 else 0, batch_fn=None
-        )
-        if not self.tokenized_requests:
-            eval_logger.info(
-                "Tokenized requests are disabled. Context + generation length is not checked."
-            )
-        if self._concurrent <= 1:
-            pbar = tqdm(desc="Requesting API", total=len(requests))
-            for chunk in chunked:
-                contexts, all_gen_kwargs, encodings_list = zip(*chunk)
-                if self.tokenized_requests:
-                    max_gen_toks = all_gen_kwargs[0].get(
-                        "max_gen_toks", self._max_gen_toks
-                    )
-                    max_context_len = self.max_length - max_gen_toks
-                    encodings_list = [x[-max_context_len:] for x in encodings_list]
-                    if any(
-                        len(x) + max_gen_toks > self.max_length for x in encodings_list
-                    ):
-                        eval_logger.warning(
-                            f"Some contexts exceeded (max length: ({self.max_length}) - max_gen_toks: ({max_gen_toks}). They were left truncated."
-                        )
-                req = encodings_list if self.tokenized_requests else contexts
-                outputs = retry(
-                    stop=stop_after_attempt(self.max_retries),
-                    wait=wait_exponential(multiplier=0.5, min=1, max=10),
-                    reraise=True,
-                )(self.model_call)(
-                    messages=req,
-                    generate=True,
-                    gen_kwargs=copy.deepcopy(all_gen_kwargs[0]),
-                )
-                for generated_text, context in zip(
-                    self.parse_generations(
-                        outputs=outputs,
-                        contexts=contexts,
-                    ),
-                    contexts,
-                ):
-                    if generated_text is not None:
-                        res.append(generated_text)
-                        # partial caching
-                        if context is not None:
-                            self.cache_hook.add_partial(
-                                "generate_until",
-                                (context, all_gen_kwargs[0]),
-                                generated_text,
-                            )
-                            pbar.update(1)
-        else:
-            for chunk in chunked:
-                contexts, all_gen_kwargs, encodings_list = zip(*chunk)
-                if self.tokenized_requests:
-                    max_gen_toks = all_gen_kwargs[0].get(
-                        "max_gen_toks", self._max_gen_toks
-                    )
-                    max_context_len = self.max_length - max_gen_toks
-                    encodings_list = [x[-max_context_len:] for x in encodings_list]
-                    if any(
-                        len(x) + max_gen_toks > self.max_length for x in encodings_list
-                    ):
-                        eval_logger.warning(
-                            f"Some contexts exceeded (max length: ({self.max_length}) - max_gen_toks ({max_gen_toks}). They were left truncated."
-                        )
-                req = encodings_list if self.tokenized_requests else contexts
-                results = itertools.chain.from_iterable(
-                    asyncio.run(
-                        self.get_batched_requests(
-                            req,
-                            cache_keys=[(ctx, all_gen_kwargs[0]) for ctx in contexts],
-                            generate=True,
-                            gen_kwargs=copy.deepcopy(all_gen_kwargs[0]),
-                        )
-                    )
-                )
-                res.extend(results)
-        return re_ord.get_original(res)
-    def loglikelihood_rolling(
-        self, requests: List[Instance], disable_tqdm: bool = False
-    ) -> List[float]:
-        loglikelihoods = []
-        for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
-            rolling_token_windows = list(
-                map(
-                    utils.make_disjoint_window,
-                    utils.get_rolling_token_windows(
-                        token_list=self.tok_encode(string),
-                        prefix_token=self.prefix_token_id,
-                        # max_seq_len - (1 for context)
-                        max_seq_len=self.max_length - 1,
-                        context_len=1,
-                    ),
-                )
-            )
-            # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
-            rolling_token_windows = [(None,) + x for x in rolling_token_windows]
-            string_nll = self._loglikelihood_tokens(
-                rolling_token_windows,
-                disable_tqdm=True,
-            )
-            # discard is_greedy
-            string_nll = [x[0] for x in string_nll]
-            string_nll = sum(string_nll)
-            loglikelihoods.append(string_nll)
-            # cache this loglikelihood_rolling request
-            self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
-        return loglikelihoods

lm-evaluation-harness/lm_eval/models/dummy.py DELETED Viewed

@@ -1,41 +0,0 @@
-import random
-from tqdm import tqdm
-from lm_eval.api.model import LM
-from lm_eval.api.registry import register_model
@register_model("dummy")
class DummyLM(LM):
    """Placeholder model that fabricates outputs instead of running a network."""

    def __init__(self) -> None:
        super().__init__()

    @classmethod
    def create_from_arg_string(cls, arg_string, additional_config=None):
        """Build an instance; both arguments are accepted but ignored."""
        return cls()

    def loglikelihood(self, requests, disable_tqdm: bool = False):
        """Return one ``(-random.random(), False)`` pair per request."""
        return [
            (-random.random(), False) for _ in tqdm(requests, disable=disable_tqdm)
        ]

    def generate_until(self, requests, disable_tqdm: bool = False):
        """Return the string ``"lol"`` for every request.

        Asserts that the first argument of each request is not blank.
        """
        outputs = []
        for req in tqdm(requests, disable=disable_tqdm):
            outputs.append("lol")
            assert req.arguments[0].strip() != ""
        return outputs

    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
        """Return one ``-random.random()`` value per request."""
        return [-random.random() for _ in tqdm(requests, disable=disable_tqdm)]

lm-evaluation-harness/lm_eval/models/gguf.py DELETED Viewed

@@ -1,132 +0,0 @@
-import logging
-import time
-import requests
-from requests.exceptions import RequestException
-from tqdm import tqdm
-from lm_eval.api.model import LM
-from lm_eval.api.registry import register_model
-logger = logging.getLogger(__name__)
def get_result(logprobs, context_length):
    """Extract the continuation log-probability and greedy flag from a logprobs payload.

    Args:
        logprobs: Mapping with the parallel lists ``"text_offset"``,
            ``"tokens"``, ``"token_logprobs"`` and ``"top_logprobs"``; each
            ``top_logprobs`` entry that is inspected must be a mapping from
            candidate token to log-probability.
        context_length: Character length of the context. Tokens whose text
            offset is below this value are treated as context.

    Returns:
        A ``(continuation_logprob, is_greedy)`` tuple. ``is_greedy`` is True
        when every token from the first continuation token onward is the
        highest-scoring entry of its ``top_logprobs`` mapping.
    """
    offsets = logprobs["text_offset"]
    tokens = logprobs["tokens"]
    token_logprobs = logprobs["token_logprobs"]
    top_logprobs = logprobs["top_logprobs"]

    # Skip the context tokens. The bound check keeps the scan from running
    # off the end of the list when every offset lies inside the context.
    idx = 0
    while idx < len(offsets) and offsets[idx] < context_length:
        idx += 1

    # The final token is left out of the sum.
    # NOTE(review): presumably because the caller requests one extra generated
    # token (max_tokens=1 with echo) -- confirm against the request payload.
    continuation_logprobs = sum(token_logprobs[idx:-1])

    is_greedy = True
    for token, top_tokens in zip(tokens[idx:], top_logprobs[idx:]):
        if max(top_tokens, key=top_tokens.get) != token:
            is_greedy = False
            break
    return continuation_logprobs, is_greedy
@register_model("gguf", "ggml")
class GGUFLM(LM):
    """LM backend that sends prompts to a server's ``/v1/completions`` HTTP endpoint."""

    def __init__(self, base_url=None, max_length=2048, **kwargs):
        """Store the connection settings.

        Args:
            base_url: Root URL of the completion server; required.
            max_length: Stored on the instance as ``self.max_length``.
            **kwargs: Accepted and ignored.
        """
        super().__init__()
        self.base_url = base_url
        assert self.base_url, "must pass `base_url` to use GGUF LM!"
        self.logprobs = 10
        self.temperature = 0.0
        self.max_length = max_length

    def gguf_completion(
        self, context, continuation=None, stop=None, retries=3, delay=5, **kwargs
    ):
        """POST a completion request, retrying on ``RequestException``.

        Args:
            context: Prompt text.
            continuation: Optional text appended to the prompt. When given,
                the request asks for one token with ``echo`` enabled.
            stop: Optional stop sequences forwarded as ``"stop"``.
            retries: Maximum number of attempts.
            delay: Seconds to wait between attempts.
            **kwargs: Accepted and ignored.

        Returns:
            The decoded JSON body of the first successful response.

        Raises:
            RuntimeError: If every attempt fails; the last request error is
                attached as the cause.
        """
        # The payload does not change between attempts, so build it once.
        prompt = context
        request = {
            "prompt": prompt,
            "logprobs": self.logprobs,
            "temperature": self.temperature,
        }
        if continuation:
            prompt += continuation
            request.update({"prompt": prompt, "max_tokens": 1, "echo": True})
        if stop is not None:
            request["stop"] = stop

        last_error = None
        for attempt in range(retries):
            try:
                response = requests.post(
                    f"{self.base_url}/v1/completions", json=request
                )
                response.raise_for_status()
                return response.json()
            except RequestException as e:
                last_error = e
                logger.error(f"RequestException: {e}")
                # No point sleeping once there is no attempt left.
                if attempt < retries - 1:
                    time.sleep(delay)  # wait before retrying
        raise RuntimeError(
            f"Failed to get a valid response after {retries} retries."
        ) from last_error

    def loglikelihood(self, requests, disable_tqdm: bool = False):
        """Score each ``(context, continuation)`` request.

        Returns:
            One ``(logprob, is_greedy)`` tuple per request, in request order.

        Raises:
            RuntimeError: If a response has no choices or no usable logprobs.
                Skipping such a request would leave the result list shorter
                than the request list and shift every later result.
        """
        if not requests:
            return []
        res = []
        for context, continuation in tqdm(
            [req.args for req in requests], disable=disable_tqdm
        ):
            response = self.gguf_completion(context=context, continuation=continuation)
            if not (response and "choices" in response and response["choices"]):
                logger.error(
                    f"Invalid response for loglikelihood. Response: {response}"
                )
                # An explicit raise survives `python -O`, unlike `assert False`.
                raise RuntimeError(
                    f"Invalid response for loglikelihood. Response: {response}"
                )
            logprobs = response["choices"][0].get("logprobs")
            if not (
                logprobs
                and "token_logprobs" in logprobs
                and logprobs["token_logprobs"]
            ):
                logger.warning(
                    "Invalid logprobs data. Expected 'logprobs' to contain 'token_logprobs' list."
                )
                raise RuntimeError(
                    "Invalid logprobs data. Expected 'logprobs' to contain 'token_logprobs' list."
                )
            logprob, is_greedy = get_result(logprobs, len(context))
            res.append((logprob, is_greedy))
        return res

    def generate_until(self, requests, disable_tqdm: bool = False):
        """Generate text for each ``(context, request_args)`` request.

        Stop sequences come from ``request_args["until"]`` and default to
        ``["</s>"]``.

        Returns:
            One entry per request: the stripped generated text, or ``None``
            when the response had no usable choice.
        """
        if not requests:
            return []
        res = []
        for request in tqdm([req.args for req in requests], disable=disable_tqdm):
            inp = request[0]
            request_args = request[1]
            until = request_args.get("until", ["</s>"])
            response = self.gguf_completion(context=inp, stop=until)
            choices = response.get("choices") if response else None
            if choices and "text" in choices[0]:
                res.append(choices[0]["text"].strip())
            else:
                logger.error(f"Invalid response for greedy_until. Response: {response}")
                res.append(None)  # Add default value in case of error
        return res

    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
        """Not supported by this backend; always raises ``NotImplementedError``."""
        raise NotImplementedError(
            "loglikelihood_rolling not yet supported for GGUF models"
        )

lm-evaluation-harness/lm_eval/models/hf_audiolm.py DELETED Viewed

@@ -1,307 +0,0 @@
-import copy
-from typing import Dict, List, Optional, Tuple, Union
-import torch
-import transformers
-from tqdm import tqdm
-from transformers import BatchEncoding
-from lm_eval.api.instance import Instance
-from lm_eval.api.registry import register_model
-from lm_eval.models.huggingface import HFLM
-from lm_eval.models.utils import (
-    Collator,
-    replace_placeholders,
-    stop_sequences_criteria,
-)
-DEFAULT_AUDIO_PLACEHOLDERS = ["<audio>"]
@register_model("hf-audiolm-qwen")
class HFAUDIOLMQWEN(HFLM):
    """
    An abstracted Hugging Face model class for Audio LM model like Qwen2-Audio.
    """

    AUTO_MODEL_CLASS = transformers.Qwen2AudioForConditionalGeneration
    MULTIMODAL = True  # flag to indicate, for now, that this model type can run multimodal requests

    def __init__(
        self,
        pretrained: Union[str, transformers.PreTrainedModel],
        max_audios: Optional[int] = 5,
        **kwargs,
    ):
        """
        Args:
            pretrained: Model name or an already-instantiated model, forwarded to HFLM.
            max_audios: Passed to `replace_placeholders` when audio placeholders are expanded.
            **kwargs: Forwarded unchanged to `HFLM.__init__`.
        """
        # We initialize using HFLM's init. Sub-methods like _create_model and _create_tokenizer
        # modify init behavior.
        super().__init__(pretrained, **kwargs)
        self.max_audios = max_audios
        self.chat_applied: bool = False

    def _create_tokenizer(
        self,
        pretrained: Union[str, transformers.PreTrainedModel],
        tokenizer: Optional[
            Union[
                str,
                transformers.ProcessorMixin,
            ]
        ],
        revision: Optional[str] = "main",
        trust_remote_code: Optional[bool] = False,
        **kwargs,
    ) -> None:
        """
        Helper method during initialization.
        For the multimodal variant, we initialize not just
        `self.tokenizer` but also `self.processor`.
        """
        if tokenizer:
            # NOTE(review): both branches below *return* an object instead of assigning
            # `self.tokenizer` / `self.processor`, which contradicts the `-> None`
            # annotation and the docstring. Left unchanged because the caller in HFLM
            # is not visible here -- confirm whether the return value is consumed.
            if isinstance(tokenizer, str):
                return transformers.AutoTokenizer.from_pretrained(
                    tokenizer,
                    revision=revision,
                    trust_remote_code=trust_remote_code,
                    # use_fast=use_fast_tokenizer,
                )
            else:
                assert isinstance(
                    tokenizer, transformers.ProcessorMixin
                )  # TODO: check this condition
                return tokenizer
        # Get tokenizer based on 'pretrained'
        if isinstance(pretrained, str):
            model_name = pretrained
        else:
            # get the HF hub name via accessor on model
            model_name = self.model.name_or_path
        self.processor = transformers.AutoProcessor.from_pretrained(
            model_name,
            revision=revision,
            trust_remote_code=trust_remote_code,
            # use_fast=use_fast_tokenizer,
        )
        self.tokenizer = self.processor.tokenizer

    def apply_chat_template(
        self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
    ) -> str:
        """
        Method to apply a chat template to a list of chat history between user and model.
        """
        chat_templated = self.processor.apply_chat_template(
            chat_history, tokenize=False, add_generation_prompt=add_generation_prompt
        )
        return chat_templated

    def _model_multimodal_generate(self, inputs, max_length, stop, **generation_kwargs):
        """
        Run `self.model.generate` on processor outputs with stop-sequence criteria.

        Temperature defaults to 0.0; a zero temperature with unset `do_sample`
        switches to greedy decoding and the temperature is then dropped from the call.
        """
        generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
        do_sample = generation_kwargs.get("do_sample", None)
        # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
        if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
            generation_kwargs["do_sample"] = do_sample = False
        if do_sample is False and generation_kwargs.get("temperature") == 0.0:
            generation_kwargs.pop("temperature")
        stopping_criteria = stop_sequences_criteria(
            self.tokenizer,
            stop,
            inputs["input_ids"].shape[1],
            inputs["input_ids"].shape[0],
        )
        return self.model.generate(
            **inputs,
            max_length=max_length,
            stopping_criteria=stopping_criteria,
            pad_token_id=self.tokenizer.pad_token_id,
            use_cache=True,
            **generation_kwargs,
        )

    def tok_batch_multimodal_encode(
        self,
        strings: List[str],  # note that input signature of this fn is different
        audios: List[List],
        padding_side: str = "left",
        left_truncate_len: Optional[int] = None,
        truncation: bool = False,
    ) -> Union[
        BatchEncoding, Dict[str, torch.Tensor]
    ]:  # note that this return signature differs from HFLM tok_batch_encode.
        """
        Encode a batch of prompts together with their audio through `self.processor`.

        `padding_side`, `left_truncate_len` and `truncation` are accepted for
        signature compatibility but are not used by this implementation.
        """

        # NOTE: here, we replace <audio> tags with the model's audio token string.
        def _replace_placeholder(placeholder, strings):
            return [
                replace_placeholders(
                    string,
                    placeholder,
                    "<|audio_bos|><|AUDIO|><|audio_eos|>",
                    self.max_audios,
                )
                for string in strings
            ]

        if not self.chat_applied:
            # TODO<baber>: This still keeps the whitespace in the audio placeholder, which is not ideal.
            for placeholder in DEFAULT_AUDIO_PLACEHOLDERS:
                strings = _replace_placeholder(placeholder, strings)
        encoding = self.processor(
            audios=audios,
            text=strings,
            padding=True,
            return_tensors="pt",
            # **add_special_tokens, # TODO: at least some Processors error out when passing this. How do we control whether text gets BOS added?
        )
        encoding.to(  # TODO: our other tokenization methods in HFLM don't typically move to device. this breaks convention
            self.device, self.model.dtype
        )  # TODO: confirm which tensors this dtype cast applies to.
        return encoding

    def generate_until(
        self, requests: List[Instance], disable_tqdm: bool = False
    ) -> List[str]:
        """
        Generate a continuation for every text+audio request.

        Requests are grouped by generation kwargs, processed in batches of
        `self.batch_size`, and the results are returned in the original request order.
        """
        import gc  # imported once per call rather than once per batch

        res = []

        def _collate(x):
            # the negative sign on len(toks) sorts descending - this has a few advantages:
            # - time estimates will always be over not underestimates, which is more useful for planning
            # - to know the size of a batch when going through the list, you know the first one is always the batch
            #   padded context length. this is useful to simplify the batching logic and more importantly to make
            #   automatic adaptive batches much much easier to implement
            # - any OOMs will happen right away rather than near the end
            toks = self.tok_encode(x[0])
            return -len(toks), x[0]

        pbar = tqdm(
            total=len(requests),
            disable=(disable_tqdm or (self.rank != 0)),
            desc="Running generate_until requests with text+audio input",
        )
        # TODO: port auto-batch sizing into this.
        # we group requests by their generation_kwargs,
        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
        # in the same batch.
        re_ords = Collator(
            [reg.args for reg in requests],
            _collate,
            group_by="gen_kwargs",
            group_fn=lambda x: x[1],
        )
        chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
        ### Up to here: was identical to non-multimodal HFLM generate_until ###
        for chunk in chunks:
            contexts, all_gen_kwargs, aux_arguments = zip(*chunk)
            # flatten the per-request audio lists into one list of raw arrays
            audios = [
                audio["array"]
                for audio_lst_dict in aux_arguments
                for audio in audio_lst_dict["audio"]
            ]
            if not isinstance(contexts, list):
                contexts = list(
                    contexts
                )  # the processor is unhappy accepting a tuple of strings instead of a list.
                # TODO: could we upstream this workaround to HF?
            ### this part onward: same as HFLM ###
            # we assume all gen kwargs in the batch are the same
            # this is safe to assume because the `grouper` object ensures it.
            gen_kwargs = all_gen_kwargs[0]
            # unpack our keyword arguments.
            until = None
            if isinstance(gen_kwargs, dict):
                kwargs = copy.deepcopy(gen_kwargs)  # edge case for repeats > 1
                if "until" in kwargs.keys():
                    until = kwargs.pop("until")
                    if isinstance(until, str):
                        until = [until]
                    elif not isinstance(until, list):
                        raise ValueError(
                            f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
                        )
            else:
                raise ValueError(
                    f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
                )
            # add EOS token to stop sequences
            eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False)
            if not until:
                until = [eos]
            else:
                until.append(eos)
            if "max_gen_toks" in kwargs.keys():
                max_gen_toks = kwargs.pop("max_gen_toks")
            else:
                max_gen_toks = self.max_gen_toks
            ## end stuff that's entirely copied verbatim from HFLM ###
            max_ctx_len = self.max_length - max_gen_toks
            inputs = self.tok_batch_multimodal_encode(
                contexts,
                audios,
                left_truncate_len=max_ctx_len,
                truncation=self.truncation,
            )
            context_enc = inputs["input_ids"]
            if "max_length" not in kwargs:
                kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
            # Use the model's configured device instead of a hard-coded "cuda",
            # so the method also runs on CPU or any other accelerator.
            inputs["input_ids"] = inputs["input_ids"].to(self.device)
            cont = self._model_multimodal_generate(inputs, stop=until, **kwargs)
            del inputs
            torch.cuda.empty_cache()
            gc.collect()
            ### essentially same as HFLM beyond this line!
            cont_toks_list = cont.tolist()
            for cont_toks, context in zip(cont_toks_list, contexts):
                # discard context + left-padding toks so only the generated tokens remain
                cont_toks = cont_toks[context_enc.shape[1] :]
                s = self.tok_decode(cont_toks)
                res.append(s)
                self.cache_hook.add_partial(
                    "generate_until", (context, gen_kwargs), s
                )  # TODO: cache key for multimodal input should be what?
                pbar.update(1)
        # reorder this group of results back to original unsorted form
        res = re_ords.get_original(res)
        pbar.close()
        return res

    def loglikelihood_rolling(
        self, requests: List[Instance], disable_tqdm: bool = False
    ) -> List[float]:
        """Not supported for this model type; always raises `NotImplementedError`."""
        # A single string: passing two comma-separated strings made the
        # exception message render as a tuple.
        raise NotImplementedError(
            "model type `hf-audiolm` does not support loglikelihood_rolling. Use 'hf' model type for text-only loglikelihood_rolling tasks "
            "this is because we do not support measuring the loglikelihood a model assigns to audio."
        )

    def loglikelihood(
        self, requests: List[Instance], disable_tqdm: bool = False
    ) -> List[Tuple[float, bool]]:
        """Not enabled for this model type; always raises `NotImplementedError`."""
        raise NotImplementedError(
            "'loglikelihood' requests for model type `hf-audiolm` are not yet tested. This feature will be enabled when a loglikelihood-based multiple-choice VQA dataset is added!"
        )

lm-evaluation-harness/lm_eval/models/hf_steered.py DELETED Viewed

@@ -1,243 +0,0 @@
-from contextlib import contextmanager
-from functools import partial
-from pathlib import Path
-from typing import Any, Callable, Generator, Optional, Union
-import torch
-from peft.peft_model import PeftModel
-from torch import Tensor, nn
-from transformers import PreTrainedModel
-from lm_eval.api.registry import register_model
-from lm_eval.models.huggingface import HFLM
@contextmanager
def steer(
    model: Union[PreTrainedModel, PeftModel], hook_to_steer: dict[str, Callable]
) -> Generator[None, Any, None]:
    """
    Context manager that temporarily hooks models and steers them.

    Args:
        model: The transformer model to hook
        hook_to_steer: Dictionary mapping hookpoints to steering functions

    Yields:
        None

    Raises:
        ValueError: If a hookpoint does not name a module of ``model.base_model``.
            Hooks registered before the mismatch was detected are removed first.
    """

    def create_hook(hookpoint: str):
        def hook_fn(module: nn.Module, input: Any, output: Tensor):
            # If output is a tuple (like in some transformer layers), take first element
            if isinstance(output, tuple):
                output = (hook_to_steer[hookpoint](output[0]), *output[1:])  # type: ignore
            else:
                output = hook_to_steer[hookpoint](output)
            return output

        return hook_fn

    handles = []
    hookpoints = list(hook_to_steer.keys())
    # Registration and validation both sit inside the `try`, so the `finally`
    # clause also cleans up when the ValueError below is raised.
    try:
        for name, module in model.base_model.named_modules():
            if name in hook_to_steer:
                handles.append(module.register_forward_hook(create_hook(name)))
        if len(handles) != len(hookpoints):
            raise ValueError(f"Not all hookpoints could be resolved: {hookpoints}")
        yield None
    finally:
        for handle in handles:
            handle.remove()
@register_model("steered")
class SteeredModel(HFLM):
    # Maps each hookpoint name to the function applied to that module's output.
    hook_to_steer: dict[str, Callable]

    def __init__(
        self,
        pretrained: str,
        steer_path: str,
        device: Optional[str] = None,
        **kwargs,
    ):
        """
        HFLM with a steered forward pass.

        To derive steering vectors from a sparse model loadable with sparsify or sae_lens,
        provide the path to a CSV file with the following columns (example rows are provided below):

        loader,action,sparse_model,hookpoint,feature_index,steering_coefficient,sae_id,description,
        sparsify,add,EleutherAI/sae-pythia-70m-32k,layers.3,30,10.0,,,
        sae_lens,add,gemma-scope-2b-pt-res-canonical,layers.20,12082,240.0,layer_20/width_16k/canonical,increase dogs,

        To load steering vectors directly, provide the path to a pytorch (.pt) file with content in the following format:

        {
            hookpoint: {
                "steering_vector": <torch.Tensor>,
                "steering_coefficient": <float>,
                "action": <Literal["add", "clamp"]>,
                "bias": <torch.Tensor | None>,
            },
            ...
        }

        Raises:
            ValueError: If `steer_path` has an unsupported extension or an
                entry uses an action other than "add" or "clamp".
        """
        super().__init__(pretrained=pretrained, device=device, **kwargs)
        if steer_path.endswith((".pt", ".pth")):
            with open(steer_path, "rb") as f:
                steer_config: dict[str, dict[str, Any]] = torch.load(
                    f, weights_only=True
                )
        elif steer_path.endswith(".csv"):
            steer_config = self.derive_steer_config(steer_path)
        else:
            raise ValueError(f"Unknown steer file type: {steer_path}")

        hook_to_steer = {}
        for hookpoint, steer_info in steer_config.items():
            action = steer_info["action"]
            steering_coefficient = steer_info["steering_coefficient"]
            steering_vector = (
                steer_info["steering_vector"].to(self.device).to(self.model.dtype)
            )
            # "bias" is optional: a missing key is treated like an explicit None.
            bias = steer_info.get("bias")
            if bias is not None:
                bias = bias.to(self.device).to(self.model.dtype)

            if action == "add":
                # Steers the model by adding some multiple of a steering vector to all sequence positions.
                # The vector and coefficient are bound as default arguments: a plain
                # closure would read the loop variables when the hook runs, so every
                # "add" hook would use the values of the last hookpoint.
                hook_to_steer[hookpoint] = (
                    lambda acts, coef=steering_coefficient, vec=steering_vector: acts
                    + coef * vec
                )
            elif action == "clamp":
                hook_to_steer[hookpoint] = partial(
                    self.clamp,
                    steering_vector=steering_vector,
                    value=steering_coefficient,
                    bias=bias,
                )
            else:
                raise ValueError(f"Unknown hook type: {action}")
        self.hook_to_steer = hook_to_steer

    @classmethod
    def derive_steer_config(cls, steer_path: str):
        """Derive a dictionary of steering vectors from sparse model(/s) specified in a CSV file.

        Returns:
            A dict keyed by hookpoint; each value holds "action",
            "steering_coefficient", "steering_vector" and "bias". Rows that
            share a hookpoint overwrite one another, the last row winning.

        Raises:
            ValueError: If a row names a loader other than "sparsify" or "sae_lens".
        """
        import pandas as pd

        df = pd.read_csv(steer_path)
        steer_data: dict[str, dict[str, Any]] = {}

        # Rows fall back to the "sparsify" loader when the column is absent,
        # so the optional imports must honour the same default instead of
        # failing on a missing column.
        loaders = set(df["loader"]) if "loader" in df.columns else {"sparsify"}

        if "sparsify" in loaders:
            from sparsify import SparseCoder
        if "sae_lens" in loaders:
            from sae_lens import SAE

            sae_cache = {}

            def load_from_sae_lens(sae_release: str, sae_id: str):
                # Each (release, id) pair is loaded once and reused across rows.
                cache_key = (sae_release, sae_id)
                if cache_key not in sae_cache:
                    sae_cache[cache_key] = SAE.from_pretrained(sae_release, sae_id)[0]
                return sae_cache[cache_key]

        for _, row in df.iterrows():
            action = row.get("action", "add")
            sparse_name = row["sparse_model"]
            hookpoint = row["hookpoint"]
            feature_index = int(row["feature_index"])
            steering_coefficient = float(row["steering_coefficient"])
            loader = row.get("loader", "sparsify")

            if loader == "sparsify":
                name_path = Path(sparse_name)
                # A path that exists locally is loaded from disk, otherwise from the hub.
                sparse_coder = (
                    SparseCoder.load_from_disk(name_path / hookpoint)
                    if name_path.exists()
                    else SparseCoder.load_from_hub(sparse_name, hookpoint)
                )
                assert sparse_coder.W_dec is not None
                steering_vector = sparse_coder.W_dec[feature_index]
                bias = sparse_coder.b_dec
            elif loader == "sae_lens":
                sparse_coder = load_from_sae_lens(
                    sae_release=sparse_name, sae_id=row["sae_id"]
                )
                steering_vector = sparse_coder.W_dec[feature_index]
                bias = sparse_coder.b_dec
                # An empty hookpoint cell falls back to the hook name in the SAE config.
                if hookpoint == "" or pd.isna(hookpoint):
                    hookpoint = sparse_coder.cfg.hook_name
            else:
                raise ValueError(f"Unknown loader: {loader}")

            steer_data[hookpoint] = {
                "action": action,
                "steering_coefficient": steering_coefficient,
                "steering_vector": steering_vector,
                "bias": bias,
            }
        return steer_data

    @classmethod
    def clamp(
        cls,
        acts: Tensor,
        steering_vector: Tensor,
        value: float,
        bias: Optional[Tensor] = None,
    ):
        """Clamps a direction of the activations to be the steering vector * the value.

        Args:
            acts (Tensor): The activations tensor to edit of shape [batch, pos, features]
            steering_vector (Tensor): A direction to clamp of shape [features]
            value (float): Value to clamp the direction to
            bias (Tensor | None): Optional bias to add to the activations

        Returns:
            Tensor: The modified activations with the specified direction clamped
        """
        if bias is not None:
            acts = acts - bias
        direction = steering_vector / torch.norm(steering_vector)
        # Remove the current component along `direction`, then set it to `value`.
        proj_magnitude = torch.sum(acts * direction, dim=-1, keepdim=True)
        orthogonal_component = acts - proj_magnitude * direction
        clamped = orthogonal_component + direction * value
        if bias is not None:
            return clamped + bias
        return clamped

    def forward(self, *args, **kwargs):
        """Run the model's forward pass with steering hooks attached and gradients disabled."""
        with torch.no_grad():
            with steer(self.model, self.hook_to_steer):
                return self.model.forward(*args, **kwargs)

    def _model_call(self, *args, **kwargs):
        """Delegate to `HFLM._model_call` while the steering hooks are attached."""
        with steer(self.model, self.hook_to_steer):
            return super()._model_call(*args, **kwargs)

    def _model_generate(self, *args, **kwargs):
        """Delegate to `HFLM._model_generate` while the steering hooks are attached."""
        with steer(self.model, self.hook_to_steer):
            return super()._model_generate(*args, **kwargs)

lm-evaluation-harness/lm_eval/models/hf_vlms.py DELETED Viewed

@@ -1,757 +0,0 @@
-import copy
-import logging
-from typing import Dict, List, Optional, Tuple, Union
-import torch
-import torch.nn.functional as F
-import transformers
-from tqdm import tqdm
-from transformers import BatchEncoding
-from lm_eval.api.instance import Instance
-from lm_eval.api.registry import register_model
-from lm_eval.models.huggingface import HFLM
-from lm_eval.models.utils import (
-    Collator,
-    flatten_image_list,
-    handle_stop_sequences,
-    pad_and_concat,
-    replace_placeholders,
-    resize_image,
-    stop_sequences_criteria,
-)
-DEFAULT_IMAGE_PLACEHOLDER = "<image>"
-eval_logger = logging.getLogger(__name__)
-@register_model("hf-multimodal")
-class HFMultimodalLM(HFLM):
-    """
-    An abstracted Hugging Face model class for multimodal LMs like Llava and Idefics.
-    """
-    AUTO_MODEL_CLASS = transformers.AutoModelForVision2Seq
-    MULTIMODAL = True  # flag to indicate, for now, that this model type can run multimodal requests
-    def __init__(
-        self,
-        pretrained: Union[str, transformers.PreTrainedModel],
-        image_token_id: Optional[int] = None,
-        image_string: Optional[str] = None,
-        interleave: bool = True,
-        # TODO: handle whitespace in image placeholder (replacement)
-        max_images: Optional[int] = 999,
-        convert_img_format=False,
-        # For image resizing
-        min_pixels: Optional[int] = None,
-        max_pixels: Optional[int] = None,
-        image_width: Optional[int] = None,
-        image_height: Optional[int] = None,
-        image_max_side: Optional[int] = None,
-        **kwargs,
-    ):
-        self.image_width = image_width
-        self.image_height = image_height
-        self.image_max_side = image_max_side
-        if self.image_max_side and (self.image_width or self.image_height):
-            raise ValueError(
-                "Ambiguous config for image resize: you can not specify both "
-                "image_max_side and (image_width or image_height)"
-            )
-        # init pixels before calling tokenizer creation to avoid errors
-        self.pixels = ({"min_pixels": min_pixels} if min_pixels else {}) | (
-            {"max_pixels": max_pixels} if max_pixels else {}
-        )
-        # We initialize using HFLM's init. Sub-methods like _create_model and _create_tokenizer
-        # modify init behavior.
-        super().__init__(pretrained, **kwargs)
-        assert self.batch_size != "auto", (
-            "Batch size 'auto' is not yet supported for hf-multimodal models."
-        )
-        self.chat_applied: bool = False
-        # TODO: phi-3.5 "image placeholders" are <image_1>, <image_2>, ... in order. how to handle this case
-        # HF AutoModelForVision2Seq models have an `image_token_id` value in their configs
-        # denoting the token which indicates a location where an image will be substituted in.
-        # This can take different string values across models, e.g. <image> for Idefics2 and <|image_pad|> for Qwen2-VL
-        self.interleave = interleave
-        self.max_images = max_images
-        self.rgb = convert_img_format
-        # WARNING: improperly set image_token_id can lead to ignored image input or other (potentially silent) errors!
-        if not image_string:
-            self.image_token_id = (
-                int(image_token_id)
-                if image_token_id
-                else (
-                    getattr(self.config, "image_token_id", None)
-                    or getattr(self.config, "image_token_index", None)
-                )
-            )
-            assert self.image_token_id is not None, (
-                "Must have a non-None image_token_id to evaluate a Hugging Face AutoModelForVision2Seq model. Please pass `image_token_id` in `--model_args` if model's config does not already specify one."
-            )
-            # get the string this token ID corresponds to
-            self.image_token = self.tok_decode(
-                [self.image_token_id], skip_special_tokens=False
-            )
-            if image_token_id is not None:
-                eval_logger.info(
-                    f"A non-default image_token_id with image_token_id={self.image_token_id} and string value '{self.image_token}' was specified manually. Note that using an improper image_token placeholder may lead to ignored image input or errors!"
-                )
-        else:
-            eval_logger.info(
-                f"A non-default image_token string with string value image_string='{image_string}' was specified manually. Note that using an improper image_token placeholder may lead to ignored image input or errors!"
-            )
-            self.image_token = image_string
-    def _create_tokenizer(
-        self,
-        pretrained: Union[str, transformers.PreTrainedModel],
-        tokenizer: Optional[
-            Union[
-                str,
-                transformers.ProcessorMixin,
-            ]
-        ],
-        revision: Optional[str] = "main",
-        trust_remote_code: Optional[bool] = False,
-        **kwargs,
-    ) -> None:
-        """
-        Helper method during initialization.
-        For the multimodal variant, we initialize not just
-        `self.tokenizer` but also `self.processor`.
-        """
-        if tokenizer:
-            if isinstance(tokenizer, str):
-                return transformers.AutoProcessor.from_pretrained(
-                    tokenizer,
-                    revision=revision,
-                    trust_remote_code=trust_remote_code,
-                    # use_fast=use_fast_tokenizer,
-                )
-            else:
-                assert isinstance(
-                    tokenizer, transformers.ProcessorMixin
-                )  # TODO: check this condition
-                return tokenizer
-        # Get tokenizer based on 'pretrained'
-        if isinstance(pretrained, str):
-            model_name = pretrained
-        else:
-            # get the HF hub name via accessor on model
-            model_name = self.model.name_or_path
-        self.processor = transformers.AutoProcessor.from_pretrained(
-            model_name,
-            revision=revision,
-            trust_remote_code=trust_remote_code,
-            **self.pixels,
-            # use_fast=use_fast_tokenizer,
-        )
-        self.tokenizer = self.processor.tokenizer
-    def tok_multimodal_encode(
-        self, string, images, left_truncate_len=None, add_special_tokens=None
-    ):
-        """Helper function which encodes an image + string combo using AutoProcessor"""
-        # We inherit special token kwarg setup from HFLM.tok_encode
-        # special_tokens_kwargs = {}
-        # by default for CausalLM - false or self.add_bos_token is set
-        # if add_special_tokens is None:
-        #     special_tokens_kwargs = {"add_special_tokens": False or self.add_bos_token}
-        # otherwise the method explicitly defines the value
-        # else:
-        #     special_tokens_kwargs = {"add_special_tokens": add_special_tokens}
-        # encode text+images
-        # TODO: why does (Qwen2-VL) processor error when attempting to add special tokens to text?
-        encoding = self.processor(
-            text=string, images=images, return_tensors=None
-        )  # , **special_tokens_kwargs)
-        # remove (and store) our tokenized text
-        text_encoding = encoding.pop("input_ids")
-        encoding.pop("attention_mask")
-        # left-truncate the encoded context to be at most `left_truncate_len` tokens long
-        if left_truncate_len:
-            text_encoding = text_encoding[-left_truncate_len:]
-        return text_encoding, encoding  # image_encoding is a dict
-    def _encode_multimodal_pair(self, context, continuation, images):
-        """Helper function to perform the role of TemplateLM._encode_pair
-        Except allowing for image input to also be processed alongside `context`.
-        This method is a bit messy due to the need to defer conversion of image and text token input
-        into PyTorch tensors until the main inference loop.
-        """
-        n_spaces = len(context) - len(context.rstrip())
-        if n_spaces > 0:
-            continuation = context[-n_spaces:] + continuation
-            context = context[:-n_spaces]
-        # TODO: replace default <image> placeholder with self.image_token, for contexts
-        whole_enc, image_enc = self.tok_multimodal_encode(
-            context + continuation, images
-        )
-        context_enc, _ = self.tok_multimodal_encode(context, images)
-        # tok_multimodal_encode returns List[List[int]] for tokenized text. Get rid of the batch dim
-        # since we only are encoding a single string.
-        # TODO: this is a bit hacky, it'd be nice to make this generally cleaner
-        whole_enc, context_enc = whole_enc[0], context_enc[0]
-        context_enc_len = len(context_enc)
-        continuation_enc = whole_enc[context_enc_len:]
-        return context_enc, continuation_enc, image_enc
-    def apply_chat_template(
-        self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
-    ) -> str:
-        self.chat_applied = True
-        if not self.interleave:
-            for content in chat_history:
-                c = []
-                text = content["content"]
-                # Count and remove image placeholders
-                image_count = min(
-                    self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER)
-                )
-                text = text.replace(DEFAULT_IMAGE_PLACEHOLDER, "")
-                # Add image entries
-                for _ in range(image_count):
-                    c.append({"type": "image", "image": None})
-                # Add single text entry at the end
-                c.append({"type": "text", "text": text})
-                content["content"] = c
-        else:
-            for content in chat_history:
-                c = []
-                text = content["content"]
-                expected_image_count = min(
-                    self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER)
-                )
-                actual_image_count = 0
-                text_parts = text.split(DEFAULT_IMAGE_PLACEHOLDER)
-                for i, part in enumerate(text_parts):
-                    # TODO: concatenate text parts (esp. if skipping images)?
-                    if part:  # Add non-empty text parts
-                        c.append({"type": "text", "text": part})
-                    if (
-                        (i < len(text_parts) - 1) and i < self.max_images
-                    ):  # Add image placeholder after each split except the last
-                        c.append({"type": "image"})
-                        actual_image_count += 1
-                content["content"] = c
-                if actual_image_count != expected_image_count:
-                    raise ValueError(
-                        f"Mismatch in image placeholder count. Expected: {expected_image_count}, Actual: {actual_image_count}"
-                    )
-        return self.processor.apply_chat_template(
-            chat_history,
-            add_generation_prompt=add_generation_prompt,
-            continue_final_message=not add_generation_prompt,
-        )
-    def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
-        if hasattr(self.processor, "apply_chat_template"):
-            _tokenizer = self.tokenizer
-            self.tokenizer = self.processor
-            selected_template = super().chat_template(chat_template)
-            self.tokenizer = _tokenizer
-            return selected_template
-        else:
-            return super().chat_template(chat_template)
-    def tok_batch_multimodal_encode(
-        self,
-        strings: List[str],  # note that input signature of this fn is different
-        images: List[List],  # TODO: images are pil.Image at the moment, update typehint
-        padding_side: str = "left",
-        left_truncate_len: int = None,
-        truncation: bool = False,
-    ) -> Union[
-        BatchEncoding, Dict[str, torch.Tensor]
-    ]:  # note that this return signature differs from HFLM tok_batch_encode.
-        # NOTE: here, we replace <image> tags with our model's corresponding image_token string value.
-        if not self.chat_applied:
-            # TODO<baber>: This still keeps the whitespace in the image placeholder, which is not ideal.
-            strings = [
-                replace_placeholders(
-                    string, DEFAULT_IMAGE_PLACEHOLDER, self.image_token, self.max_images
-                )
-                for string in strings
-            ]
-        # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
-        old_padding_side = self.tokenizer.padding_side
-        self.tokenizer.padding_side = padding_side
-        # add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
-        images = [img[: self.max_images] for img in images]
-        if self.rgb:
-            images = [[img.convert("RGB") for img in sublist] for sublist in images]
-        # certain models like llava expect a single-level image list even for bs>1, multi-image. TODO: port this over to loglikelihoods
-        if getattr(self.config, "model_type", "") == "llava":
-            images = flatten_image_list(images)
-        encoding = self.processor(
-            images=images,
-            text=strings,
-            truncation=truncation,
-            padding="longest",
-            return_tensors="pt",
-            # **add_special_tokens, # TODO: at least some Processors error out when passing this. How do we control whether text gets BOS added?
-        )
-        encoding.to(  # TODO: our other tokenization methods in HFLM don't typically move to device. this breaks convention
-            self.device, self.model.dtype
-        )  # TODO: This only casts the pixel values. Should they always be float16?
-        if left_truncate_len:
-            encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
-            encoding["attention_mask"] = encoding["attention_mask"][
-                :, -left_truncate_len:
-            ]
-        self.tokenizer.padding_side = old_padding_side
-        return encoding
-    def _model_multimodal_call(self, inps, imgs, attn_mask=None, labels=None):
-        """
-        TODO: update docstring
-        """
-        # note: imgs is a dict.
-        with torch.no_grad():
-            return self.model(inps, **imgs).logits
-    def _model_multimodal_generate(self, inputs, max_length, stop, **generation_kwargs):
-        generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
-        do_sample = generation_kwargs.get("do_sample", None)
-        # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
-        if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
-            generation_kwargs["do_sample"] = do_sample = False
-        if do_sample is False and generation_kwargs.get("temperature") == 0.0:
-            generation_kwargs.pop("temperature")
-        stopping_criteria = stop_sequences_criteria(
-            self.tokenizer,
-            stop,
-            inputs["input_ids"].shape[1],
-            inputs["input_ids"].shape[0],
-        )
-        return self.model.generate(
-            **inputs,
-            max_length=max_length,
-            stopping_criteria=stopping_criteria,
-            pad_token_id=self.tokenizer.pad_token_id,
-            use_cache=True,
-            **generation_kwargs,
-        )
-    def _batch_images(self, image_encs):
-        """
-        Helper function: batch together image encodings across examples in a batch.
-        # TODO: for variable-sized images, this may break down.
-        """
-        batched_imgs = {}
-        for key in image_encs[0].keys():
-            batched_imgs[key] = torch.cat(
-                [
-                    torch.tensor(
-                        image_enc[key], device=self.device, dtype=self.model.dtype
-                    )
-                    for image_enc in image_encs
-                ],
-                dim=0,
-            )
-        return batched_imgs
-    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
-        if requests and len(requests[0].args) < 3:
-            # Fall back to non-multimodal generation.
-            return super().loglikelihood_rolling(requests=requests)
-        raise NotImplementedError(
-            "model type `hf-multimodal` does not support loglikelihood_rolling. Use 'hf' model type for text-only loglikelihood_rolling tasks ",
-            "this is because we do not support measuring the loglikelihood a model assigns to an image.",
-        )
-    def loglikelihood(
-        self, requests: List[Instance], disable_tqdm: bool = False
-    ) -> List[Tuple[float, bool]]:
-        if requests and len(requests[0].args) < 3:
-            # Fall back to non-multimodal generation.
-            return super().loglikelihood(requests=requests, disable_tqdm=disable_tqdm)
-        raise NotImplementedError(
-            "'loglikelihood' requests for model type `hf-multimodal` are not yet tested. This feature will be enabled when a loglikelihood-based multiple-choice VQA dataset is added!"
-        )
-        new_reqs = []
-        for context, continuation, aux_arguments in [req.args for req in requests]:
-            if context == "":
-                raise ValueError(
-                    "Must get non-empty context for multimodal requests! You might be trying to run 'loglikelihood_rolling', which is not supported in the multimodal case."
-                )
-            else:
-                visuals = aux_arguments["visual"]
-                context_enc, continuation_enc, image_enc = self._encode_multimodal_pair(
-                    context, continuation, visuals
-                )
-            # TODO: key to pick for caching images
-            new_reqs.append(
-                (
-                    (context, continuation, visuals),
-                    context_enc,
-                    continuation_enc,
-                    image_enc,
-                )
-            )
-        return self._multimodal_loglikelihood_tokens(
-            new_reqs, disable_tqdm=disable_tqdm
-        )
-    def _multimodal_loglikelihood_tokens(
-        self,
-        requests: List[
-            Tuple[Tuple[None, str, str], List[int], List[int], List[int]]
-        ],  # TODO: update typehint to be correct
-        disable_tqdm: bool = False,
-        override_bs: int = None,
-    ) -> List[Tuple[float, bool]]:
-        res = []
-        # TODO: **improve multimodal collation.** We currently ignore image size when ordering docs. ideally we'd take them into account
-        def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]):
-            """Defines the key for the sorted method"""
-            # the negative sign on len(toks) sorts descending - this has a few advantages:
-            # - time estimates will always be over not underestimates, which is more useful for planning
-            # - to know the size of a batch when going through the list, you know the first one is always the batch
-            #   padded context length. this is useful to simplify the batching logic and more importantly to make
-            #   automatic adaptive batches much much easier to implement
-            # - any OOMs will happen right away rather than near the end
-            toks = req[1] + req[2]
-            return -len(toks), tuple(toks)
-        def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]):
-            """Defines the key to group and lookup one-token continuations"""
-            # Use with group_by="contexts" (optional)"
-            # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations.
-            # speeds up some multiple-choice tasks proportionally to the number of choices.
-            # groups requests by context+continuation[:-1] and infer on one request/group.
-            return req[-1] + req[-3] + req[-2][:-1]
-        re_ord = Collator(
-            requests,
-            sort_fn=_collate,
-            group_by="contexts"  # TODO: can't group-by just "contexts" any more, need to incorporate imgs
-            if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
-            and self.logits_cache
-            else None,
-            group_fn=_lookup_one_token_cont,
-        )
-        # automatic (variable) batch size detection for vectorization
-        # pull longest context sample from request
-        n_reordered_requests = len(re_ord)
-        batch_size = (
-            self.batch_size
-            if self.batch_size != "auto"
-            else override_bs
-            if override_bs is not None
-            else 0
-        )
-        batch_fn = (
-            self._batch_scheduler
-            if self.batch_size == "auto"
-            and n_reordered_requests > 0
-            and not override_bs
-            else None
-        )
-        chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn)
-        pbar = tqdm(
-            total=len(requests),
-            disable=(disable_tqdm or (self.rank != 0)),
-            desc="Running loglikelihood requests with text+image input",
-        )
-        for chunk in chunks:
-            imgs = []
-            inps = []
-            cont_toks_list = []
-            inplens = []
-            padding_len_inp = None
-            # because vectorizing is annoying, we first convert each (context, continuation) pair to padded
-            # tensors, then we pack them together into a batch, call the model, and then pick it all apart
-            # again because vectorizing is annoying
-            for _, context_enc, continuation_enc, image_enc in chunk:
-                # sanity check
-                assert len(image_enc) > 0
-                assert len(context_enc) > 0
-                assert len(continuation_enc) > 0
-                assert len(continuation_enc) <= self.max_length
-                # how this all works (illustrated on a causal decoder-only setup):
-                #          CTX      CONT
-                # inp    0 1 2 3|4 5 6 7 8 9   <- last token is deleted by inp[:, :-1]
-                # model  \               \
-                # logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the
-                # cont_toks      4 5 6 7 8 9      [:, -len(continuation_enc):, :self.vocab_size] slice
-                # when too long to fit in context, truncate from the left
-                # TODO: assuming that we won't handle enc-dec Vision2Seq models. Is that a safe assumption?
-                inp = torch.tensor(
-                    (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
-                    dtype=torch.long,
-                    device=self.device,
-                )
-                (inplen,) = inp.shape
-                padding_len_inp = (
-                    max(padding_len_inp, inplen)
-                    if padding_len_inp is not None
-                    else inplen
-                )
-                inps.append(inp)  # [1, inp_length]
-                cont_toks_list.append(continuation_enc)
-                inplens.append(inplen)
-                imgs.append(image_enc)
-            # create encoder attn mask and batched conts, if seq2seq
-            call_kwargs = {}
-            batched_inps = pad_and_concat(
-                padding_len_inp, inps, padding_side="right"
-            )  # [batch, padding_len_inp]
-            # batch our examples' image inputs together
-            batched_imgs = self._batch_images(
-                imgs
-            )  # TODO: fix/test for bs>1 case with differently-sized imgs!
-            multi_logits = F.log_softmax(
-                self._model_multimodal_call(batched_inps, batched_imgs, **call_kwargs),
-                dim=-1,
-            )  # [batch, padding_length (inp or cont), vocab]
-            for (
-                request_str,
-                ctx_tokens,
-                _,
-                image_encs,
-            ), logits, inplen, cont_toks in zip(
-                chunk, multi_logits, inplens, cont_toks_list
-            ):
-                # Slice to original seq length
-                contlen = len(cont_toks)
-                # take only logits in the continuation
-                # (discard context toks if decoder-only ; discard right-padding)
-                # also discards + checks for "virtual tokens" in the causal LM's input window
-                # from prompt/prefix tuning tokens, if applicable
-                ctx_len = (
-                    inplen + (logits.shape[0] - padding_len_inp)
-                    if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
-                    else None
-                )
-                logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len)
-                logits = logits.unsqueeze(0)  # [1, seq, vocab]
-                # Check if per-token argmax is exactly equal to continuation
-                greedy_tokens = logits.argmax(dim=-1)
-                # check for one-token continuation cache hits.
-                # noop in case group_by != "contexts" or no cache hit and returns the
-                # original args. Otherwise, expands the logits batch dimension and yields each
-                # batch along with matching continuation tokens and prompt strings.
-                # logits -> [1, seq, vocab]
-                for request_str, cont_toks, logits in re_ord.get_cache(
-                    req_str=request_str,
-                    cxt_toks=ctx_tokens,
-                    cont_toks=cont_toks,
-                    logits=logits,
-                ):
-                    cont_toks = torch.tensor(
-                        cont_toks, dtype=torch.long, device=self.device
-                    ).unsqueeze(0)  # [1, seq]
-                    max_equal = (greedy_tokens == cont_toks).all()
-                    # Obtain log-probs at the corresponding continuation token indices
-                    # last_token_slice = logits[:, -1, :].squeeze(0).tolist()
-                    logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(
-                        -1
-                    )  # [1, seq]
-                    # Answer: (log prob, is-exact-match)
-                    answer = (float(logits.sum()), bool(max_equal))
-                    res.append(answer)
-                    self.cache_hook.add_partial(
-                        "loglikelihood", request_str, answer
-                    )  # TODO: choose convention for adding images into the cache key
-                    pbar.update(1)
-        pbar.close()
-        return re_ord.get_original(res)
-    def generate_until(
-        self, requests: List[Instance], disable_tqdm: bool = False
-    ) -> List[str]:
-        if requests and len(requests[0].args) < 3:
-            # Fall back to non-multimodal generation.
-            return super().generate_until(requests=requests, disable_tqdm=disable_tqdm)
-        res = []
-        def _collate(x):
-            # the negative sign on len(toks) sorts descending - this has a few advantages:
-            # - time estimates will always be over not underestimates, which is more useful for planning
-            # - to know the size of a batch when going through the list, you know the first one is always the batch
-            #   padded context length. this is useful to simplify the batching logic and more importantly to make
-            #   automatic adaptive batches much much easier to implement
-            # - any OOMs will happen right away rather than near the end
-            toks = self.tok_encode(x[0])
-            return -len(toks), x[0]
-        pbar = tqdm(
-            total=len(requests),
-            disable=(disable_tqdm or (self.rank != 0)),
-            desc="Running generate_until requests with text+image input",
-        )
-        # TODO: port auto-batch sizing into this.
-        # we group requests by their generation_kwargs,
-        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
-        # in the same batch.
-        re_ords = Collator(
-            [reg.args for reg in requests],
-            _collate,
-            group_by="gen_kwargs",
-            group_fn=lambda x: x[1],
-        )
-        chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
-        ### Up to here: was identical to non-multimodal HFLM generate_until ###
-        eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False)
-        for chunk in chunks:
-            contexts, all_gen_kwargs, aux_arguments = zip(*chunk)
-            visuals = [
-                [
-                    resize_image(
-                        img, self.image_width, self.image_height, self.image_max_side
-                    )
-                    for img in arg["visual"]
-                ]
-                for arg in aux_arguments
-            ]
-            if not isinstance(contexts, list):
-                contexts = list(
-                    contexts
-                )  # for Qwen2-VL, processor is unhappy accepting a tuple of strings instead of a list.
-                # TODO: could we upstream this workaround to HF?
-            ### this part onward: same as HFLM ###
-            # we assume all gen kwargs in the batch are the same
-            # this is safe to assume because the `grouper` object ensures it.
-            gen_kwargs = all_gen_kwargs[0]
-            # unpack our keyword arguments.
-            if isinstance(gen_kwargs, dict):
-                kwargs = copy.deepcopy(gen_kwargs)  # edge case for repeats > 1
-                # add EOS token to stop sequences
-                until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
-            else:
-                raise ValueError(
-                    f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
-                )
-            if "max_gen_toks" in kwargs.keys():
-                max_gen_toks = kwargs.pop("max_gen_toks")
-            else:
-                max_gen_toks = self.max_gen_toks
-            ### end stuff that's entirely copied verbatim from HFLM ###
-            max_ctx_len = self.max_length - max_gen_toks
-            inputs = self.tok_batch_multimodal_encode(
-                contexts,
-                visuals,
-                left_truncate_len=max_ctx_len,
-                truncation=self.truncation,
-            )
-            context_enc = inputs["input_ids"]
-            if "max_length" not in kwargs:
-                kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
-            cont = self._model_multimodal_generate(inputs, stop=until, **kwargs)
-            del inputs
-            torch.cuda.empty_cache()
-            import gc
-            gc.collect()
-            ### essentially same as HFLM beyond this line!
-            cont_toks_list = cont.tolist()
-            for cont_toks, context in zip(cont_toks_list, contexts):
-                # discard context + left-padding toks if using causal decoder-only VLM
-                cont_toks = cont_toks[context_enc.shape[1] :]
-                s = self.tok_decode(cont_toks)
-                # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
-                for term in until:
-                    if len(term) > 0:
-                        # ignore '' separator,
-                        # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
-                        s = s.split(term)[0]
-                res.append(s)
-                self.cache_hook.add_partial(
-                    "generate_until", (context, gen_kwargs), s
-                )  # TODO: cache key for multimodal input should be what?
-                pbar.update(1)
-        # reorder this group of results back to original unsorted form
-        res = re_ords.get_original(res)
-        pbar.close()
-        return res

lm-evaluation-harness/lm_eval/models/huggingface.py DELETED Viewed

@@ -1,1480 +0,0 @@
-import copy
-import logging
-import os
-from datetime import timedelta
-from pathlib import Path
-from typing import Any, Dict, List, Literal, Optional, Tuple, Union
-import jinja2
-import torch
-import torch.nn.functional as F
-import transformers
-from accelerate import (
-    Accelerator,
-    InitProcessGroupKwargs,
-    find_executable_batch_size,
-)
-from accelerate.utils import get_max_memory
-from huggingface_hub import HfApi
-from packaging import version
-from peft import PeftModel
-from peft import __version__ as PEFT_VERSION
-from tqdm import tqdm
-from transformers.models.auto.modeling_auto import (
-    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
-    MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
-)
-from lm_eval import utils
-from lm_eval.api.instance import Instance
-from lm_eval.api.model import TemplateLM
-from lm_eval.api.registry import register_model
-from lm_eval.models.utils import (
-    Collator,
-    clear_torch_cache,
-    configure_pad_token,
-    get_dtype,
-    handle_stop_sequences,
-    pad_and_concat,
-    stop_sequences_criteria,
-)
-# Module-level logger, named after this module via `__name__`; used for all
-# info/warning/debug output in this file.
-eval_logger = logging.getLogger(__name__)
-@register_model("hf-auto", "hf", "huggingface")
-class HFLM(TemplateLM):
-    """
-    An abstracted Huggingface model class. Enables usage with both models of
-    `transformers.AutoModelForCausalLM` and `transformers.AutoModelForSeq2SeqLM` classes.
-    Supports data-parallel multi-GPU with HF Accelerate.
-    """
-    AUTO_MODEL_CLASS = None
-    _DEFAULT_MAX_LENGTH = 2048
-    def __init__(
-        self,
-        pretrained: Union[str, transformers.PreTrainedModel],
-        backend: Literal["default", "causal", "seq2seq"] = "default",
-        # override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq)
-        revision: Optional[str] = "main",
-        subfolder: Optional[str] = None,
-        tokenizer: Optional[
-            Union[
-                str,
-                transformers.PreTrainedTokenizer,
-                transformers.PreTrainedTokenizerFast,
-            ]
-        ] = None,
-        truncation: Optional[bool] = False,
-        logits_cache: bool = True,
-        max_length: Optional[int] = None,
-        device: Optional[str] = "cuda",
-        dtype: Optional[Union[str, torch.dtype]] = "auto",
-        softmax_dtype: Optional[Union[str, torch.dtype]] = None,
-        batch_size: Optional[Union[int, str]] = 1,
-        max_batch_size: Optional[int] = 64,
-        trust_remote_code: Optional[bool] = False,
-        use_fast_tokenizer: Optional[bool] = True,
-        add_bos_token: Optional[bool] = False,
-        prefix_token_id: Optional[int] = None,
-        # arguments used for splitting a model across GPUs naively.
-        # only used if `parallelize=True`.
-        parallelize: Optional[bool] = False,
-        max_memory_per_gpu: Optional[Union[int, str]] = None,
-        max_cpu_memory: Optional[Union[int, str]] = None,
-        offload_folder: Optional[Union[str, os.PathLike]] = "./offload",
-        # PEFT, delta weights and quantization options
-        peft: Optional[str] = None,
-        delta: Optional[str] = None,
-        autogptq: Optional[Union[bool, str]] = False,
-        gptqmodel: Optional[bool] = False,
-        gguf_file: Optional[str] = None,
-        **kwargs,
-    ) -> None:
-        """
-        Set up the model, tokenizer, device and batch-size state for evaluation.
-        Two construction paths exist:
-        * `pretrained` is a string: an `Accelerator` is created, a device is chosen,
-          the config is loaded via `_get_config`, and (after the tokenizer is created)
-          the model is loaded via `_create_model`. Extra `**kwargs` are forwarded
-          to `_create_model`.
-        * `pretrained` is anything else: it is used directly as the model; device and
-          config are read from it, `parallelize` must be falsy, and rank/world size
-          are fixed to 0/1.
-        `batch_size` may be an int or a string starting with "auto", optionally
-        followed by ":<N>"; the part after the colon is stored as `batch_schedule`.
-        `subfolder`, when given, is appended to `revision` as "<revision>/<subfolder>".
-        `prefix_token_id`, when given, is stored as `custom_prefix_token_id`.
-        """
-        super().__init__()
-        # optionally: take in an already-initialized transformers.PreTrainedModel
-        if not isinstance(pretrained, str):
-            eval_logger.warning(
-                "`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way."
-            )
-            assert not parallelize, (
-                "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`"
-            )
-            self._model = pretrained
-            self._device = self._model.device
-            self._config = self._model.config
-            # no device count is taken on this path; the device-placement and
-            # multi-GPU logic further down only runs when `pretrained` is a string
-            gpus = 0
-        else:
-            assert isinstance(device, str)
-            assert isinstance(pretrained, str)
-            assert isinstance(batch_size, (int, str))
-            gpus = torch.cuda.device_count()
-            # process-group init is given a 52-week timeout
-            # (presumably so long evaluations never hit it -- TODO confirm intent)
-            accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
-            accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
-            # `self.accelerator` is only set for multi-process runs; other code in
-            # this class tests for it with `hasattr(self, "accelerator")`
-            if accelerator.num_processes > 1:
-                self.accelerator = accelerator
-            if "npu" in accelerator.device.type:
-                gpus = torch.npu.device_count()
-            # using one process with no model parallelism
-            if not (parallelize or accelerator.num_processes > 1):
-                # use user-passed device
-                device_list = set(
-                    ["cuda", "cpu"]
-                    + [f"cuda:{i}" for i in range(gpus)]
-                    + ["mps", "mps:0"]
-                    + [f"npu:{i}" for i in range(gpus)]
-                )
-                if device and device in device_list:
-                    self._device = torch.device(device)
-                    eval_logger.info(f"Using device '{device}'")
-                    if device in ("mps", "mps:0") and version.parse(
-                        torch.__version__
-                    ) < version.parse("2.1"):
-                        raise RuntimeError(
-                            f"mps requires torch >= 2.1. You have {torch.__version__}"
-                        )
-                else:
-                    # unrecognised or empty device string: fall back to cuda if
-                    # available, otherwise cpu
-                    eval_logger.info("Device not specified")
-                    eval_logger.info(f"Cuda Available? {torch.cuda.is_available()}")
-                    self._device = (
-                        torch.device("cuda")
-                        if torch.cuda.is_available()
-                        else torch.device("cpu")
-                    )
-            else:  # Parallelism managed by accelerate
-                if device != "cuda":
-                    eval_logger.info(
-                        f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model."
-                    )
-                # TODO: include in warning that `load_in_8bit` etc. affect this too
-                self._device = (
-                    self.accelerator.device
-                    if hasattr(self, "accelerator")
-                    else torch.device(device)
-                )
-            revision = str(revision)  # cast to string if not already one
-            # TODO: update this to be less of a hack once subfolder is fixed in HF
-            revision = revision + ("/" + subfolder if subfolder is not None else "")
-            self._get_config(
-                pretrained,
-                revision=revision,
-                trust_remote_code=trust_remote_code,
-                gguf_file=gguf_file,
-            )
-            # determine which of 'causal' and 'seq2seq' backends to use for HF models
-        self._get_backend(
-            config=self.config, backend=backend, trust_remote_code=trust_remote_code
-        )
-        # load tokenizer so we know tokenizer vocabulary size before loading model and PEFT
-        self._create_tokenizer(
-            pretrained,
-            tokenizer,
-            revision=revision,
-            trust_remote_code=trust_remote_code,
-            use_fast_tokenizer=use_fast_tokenizer,
-            gguf_file=gguf_file,
-            add_bos_token=add_bos_token,
-        )
-        # if we passed `pretrained` as a string, initialize our model now
-        if isinstance(pretrained, str):
-            self._create_model(
-                pretrained=pretrained,
-                revision=revision,
-                dtype=dtype,
-                trust_remote_code=trust_remote_code,
-                parallelize=parallelize,
-                gpus=gpus,
-                max_memory_per_gpu=max_memory_per_gpu,
-                max_cpu_memory=max_cpu_memory,
-                offload_folder=offload_folder,
-                peft=peft,
-                delta=delta,
-                autogptq=autogptq,
-                gptqmodel=gptqmodel,
-                gguf_file=gguf_file,
-                quantization_config=getattr(self.config, "quantization_config", None),
-                **kwargs,
-            )
-        # access self._model through self.model property outside this method
-        if isinstance(self.model, torch.nn.Module):
-            self.model.eval()
-            self.model.tie_weights()
-        self.truncation = truncation
-        self.logits_cache = logits_cache
-        self.vocab_size = self.tokenizer.vocab_size
-        # select (or create) a pad token to use
-        self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config)
-        self.add_bos_token = add_bos_token
-        # any model_type containing "gemma" forces add_bos_token on, overriding the argument
-        if "gemma" in getattr(self.config, "model_type", ""):
-            self.add_bos_token = True
-            eval_logger.info(
-                f"Model type is '{self.config.model_type}', part of the Gemma family--a BOS token will be used as Gemma underperforms without it."
-            )
-        self._max_length = max_length
-        self.pretrained = pretrained
-        self.delta = delta
-        self.peft = peft
-        self.revision = revision
-        self.batch_schedule = 1
-        self.batch_sizes = {}
-        self.max_batch_size = max_batch_size
-        self.softmax_dtype = (
-            get_dtype(softmax_dtype) if softmax_dtype is not None else None
-        )
-        # "auto" or "auto:<N>": keep the part before the colon as the per-GPU batch
-        # size marker and parse the optional <N> as a float batch schedule
-        if str(batch_size).startswith("auto"):
-            batch_size = batch_size.split(":")
-            self.batch_size_per_gpu = batch_size[0]
-            self.batch_schedule = float(batch_size[1]) if len(batch_size) > 1 else 1
-        else:
-            self.batch_size_per_gpu = int(batch_size)
-        if isinstance(pretrained, str):
-            if gpus >= 1 or str(self.device) == "mps":
-                # TODO: can remove this whole snippet except in the mps case, perhaps?
-                if not (parallelize or autogptq or hasattr(self, "accelerator")):
-                    # place model onto device requested manually,
-                    # if not using HF Accelerate or device_map
-                    # or any other option that preloads model onto device
-                    try:
-                        self.model.to(self.device)
-                    except ValueError:
-                        eval_logger.debug(
-                            "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
-                        )
-            # multigpu data-parallel support when launched with accelerate
-            if gpus > 1:
-                if accelerator.num_processes > 1:
-                    if parallelize:
-                        eval_logger.warning(
-                            "You are both using a HF Accelerate `device_map` (`--model_args parallelize=True`) and launching via `accelerate launch`. This will attempt to do model and data parallelism depending on the resources available."
-                        )
-                    elif gpus > accelerator.num_processes:
-                        eval_logger.warning(
-                            "WARNING: The number of total system GPUs does not match the number of spawned processes. "
-                            "If you would like to use data parallelism, please launch the script "
-                            "with 'accelerate launch *script*'. "
-                            f"Current run will proceed with {accelerator.num_processes} devices."
-                        )
-                        if self.accelerator.is_local_main_process:
-                            eval_logger.info(
-                                f"Using {gpus} devices with data parallelism"
-                            )
-                    self._device = torch.device(f"{accelerator.device}")
-                    self.accelerator = accelerator
-                    self._rank = self.accelerator.local_process_index
-                    self._world_size = self.accelerator.num_processes
-                else:
-                    # if we aren't launching via accelerate, ditch
-                    self._rank = 0
-                    self._world_size = 1
-        else:
-            # if a PreTrainedModel was passed into HFLM, we forgo distributed setup.
-            eval_logger.warning(
-                "Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration"
-            )
-            self._rank = 0
-            self._world_size = 1
-        self.custom_prefix_token_id = prefix_token_id
-        if prefix_token_id is not None:
-            eval_logger.info(
-                f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}"
-            )
-    def _get_accelerate_args(
-        self,
-        parallelize: Optional[bool] = None,
-        device_map: Optional[str] = "auto",
-        max_memory_per_gpu: Optional[Union[int, str]] = None,
-        max_cpu_memory: Optional[Union[int, str]] = None,
-        offload_folder: Optional[str] = "./offload",
-        gpus: Optional[int] = None,
-    ) -> dict:
-        """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
-        num_local_processes = int(os.environ.get("LOCAL_WORLD_SIZE", 1))
-        num_machines = int(os.environ.get("WORLD_SIZE", 0)) // num_local_processes
-        if (
-            num_machines == 0
-            and hasattr(self, "accelerator")
-            and self.accelerator is not None
-        ):
-            eval_logger.info(
-                "We are not in a distributed setting for accelerate. Setting model_parallel to False."
-            )
-            parallelize = False
-        if parallelize is None:
-            # If parallelism is unset by the user, we automatically assign model parallelism
-            # if enough extra GPUs are available
-            max_memory_all_gpus = get_max_memory()
-            # We just want gpu, not cpu, max memory
-            if "cpu" in max_memory_all_gpus:
-                del max_memory_all_gpus["cpu"]
-            parallelize = bool(num_local_processes < len(max_memory_all_gpus))
-            eval_logger.info(
-                f"Setting model parallel to {parallelize} since "
-                f"the number of local processes is {num_local_processes} "
-                f"and the number of GPUs is {len(max_memory_all_gpus)}"
-            )
-        args = {}
-        if parallelize:  # Model parallelism will be used
-            max_memory = {}
-            if max_memory_per_gpu is not None:  # Using the provided memory requirements
-                max_memory_per_gpu_map = {
-                    device_idx: max_memory_per_gpu for device_idx in range(gpus)
-                }
-            else:  # Estimating the possible memory requirements
-                max_memory_all_gpus = get_max_memory()
-                if "cpu" in max_memory_all_gpus:
-                    del max_memory_all_gpus["cpu"]
-                if not hasattr(self, "accelerator"):
-                    max_memory_per_gpu_map = {
-                        k: v for k, v in max_memory_all_gpus.items()
-                    }
-                else:
-                    # use only 1 / num_processes of the GPUs if we are running under accelerate launch
-                    max_memory_per_gpu_map = {
-                        k: v
-                        for k, v in max_memory_all_gpus.items()
-                        if k % num_local_processes
-                        == (self.accelerator.process_index % num_local_processes)
-                    }
-            args["max_memory"] = max_memory_per_gpu_map
-            args["device_map"] = "auto" if device_map is None else device_map
-            eval_logger.info(
-                f"Model parallel was set to True, setting max memory per GPU to {max_memory_per_gpu_map} and device map to {args.get('device_map')}"
-            )
-            if max_cpu_memory is not None:
-                max_memory["cpu"] = max_cpu_memory
-            args["offload_folder"] = offload_folder
-        elif (
-            device_map is None
-        ):  # No model parallelism, we use the default provided device for our model
-            if hasattr(self, "accelerator"):
-                device_map = {"": f"{self.accelerator.device}"}
-            else:
-                device_map = {"": str(self.device)}
-            args["max_memory"] = None
-            args["device_map"] = device_map
-            eval_logger.info(
-                f"Model parallel was set to False, max memory was not set, and device map was set to {device_map}"
-            )
-        else:
-            args["max_memory"] = None
-            args["device_map"] = None
-            eval_logger.info("Model parallel was set to False.")
-        return args
-    @property
-    def config(self):
-        # return the associated transformers.AutoConfig for the given pretrained model.
-        return self._config
-    @property
-    def model(self):
-        # returns the model, unwrapping it if using Accelerate
-        if hasattr(self, "accelerator"):
-            return self.accelerator.unwrap_model(self._model)
-        else:
-            return self._model
-    @property
-    def eot_token_id(self):
-        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
-        return self.tokenizer.eos_token_id
-    @property
-    def prefix_token_id(self):
-        # it is used as prefix for loglikelihood
-        if self.custom_prefix_token_id is not None:
-            return self.custom_prefix_token_id
-        if self.tokenizer.bos_token_id is not None:
-            return self.tokenizer.bos_token_id
-        return self.tokenizer.eos_token_id
-    @property
-    def max_length(self):
-        if self._max_length:  # if max length manually set, return it
-            return self._max_length
-        seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
-        for attr in seqlen_config_attrs:
-            if hasattr(self.model.config, attr):
-                return getattr(self.model.config, attr)
-        if hasattr(self.tokenizer, "model_max_length"):
-            if self.tokenizer.model_max_length == 1000000000000000019884624838656:
-                return self._DEFAULT_MAX_LENGTH
-            return self.tokenizer.model_max_length
-        return self._DEFAULT_MAX_LENGTH
-    @property
-    def max_gen_toks(self) -> int:
-        return 256
-    @property
-    def batch_size(self):
-        return self.batch_size_per_gpu
-    @property
-    def device(self):
-        return self._device
-    @property
-    def rank(self):
-        return self._rank
-    @property
-    def world_size(self):
-        return self._world_size
-    @property
-    def tokenizer_name(self) -> str:
-        return self.tokenizer.name_or_path.replace("/", "__")
-    def _get_backend(
-        self,
-        config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
-        backend: Literal["default", "causal", "seq2seq"] = "default",
-        trust_remote_code: Optional[bool] = False,
-    ) -> None:
-        """
-        Helper method during initialization.
-        Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder)) model type to be used.
-        sets `self.AUTO_MODEL_CLASS` appropriately if not already set.
-        **If not calling HFLM.__init__() or HFLM._get_backend() within a subclass of HFLM,
-        user must set `self.backend` to be either "causal" or "seq2seq" manually!**
-        """
-        assert backend in ["default", "causal", "seq2seq"]
-        if backend != "default":
-            # if we've settled on non-default backend, use that manually
-            if backend == "causal":
-                self.backend = backend
-            elif backend == "seq2seq":
-                self.backend = backend
-            eval_logger.info(
-                f"Overrode HF model backend type, and using type '{self.backend}'"
-            )
-        else:
-            # determine and use the default HF backend for this model, based on its config + metadata.
-            if (
-                getattr(config, "model_type")
-                in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
-            ):
-                # first check if model type is listed under seq2seq models, since some
-                # models like MBart are listed in both seq2seq and causal mistakenly in HF transformers.
-                # these special cases should be treated as seq2seq models.
-                self.backend = "seq2seq"
-                eval_logger.debug(f"Using model type '{self.backend}'")
-            elif (
-                getattr(self.config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
-            ):
-                self.backend = "causal"
-                eval_logger.debug(f"Using model type '{self.backend}'")
-            else:
-                if not trust_remote_code:
-                    eval_logger.warning(
-                        "HF model type is neither marked as CausalLM or Seq2SeqLM. \
-                    This is expected if your model requires `trust_remote_code=True` but may be an error otherwise."
-                        "Setting backend to causal"
-                    )
-                # if model type is neither in HF transformers causal or seq2seq model registries
-                # then we default to assuming AutoModelForCausalLM
-                self.backend = "causal"
-                eval_logger.info(
-                    f"Model type cannot be determined. Using default model type '{self.backend}'"
-                )
-        if self.AUTO_MODEL_CLASS is None:
-            if self.backend == "causal":
-                self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
-            elif self.backend == "seq2seq":
-                self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
-    def _get_config(
-        self,
-        pretrained: str,
-        revision: str = "main",
-        trust_remote_code: bool = False,
-        gguf_file: Optional[str] = None,
-    ) -> None:
-        """Return the model config for HuggingFace models"""
-        self._config = transformers.AutoConfig.from_pretrained(
-            pretrained,
-            revision=revision,
-            trust_remote_code=trust_remote_code,
-            gguf_file=gguf_file,
-        )
-    def _create_model(
-        self,
-        pretrained: str,
-        revision: Optional[str] = "main",
-        dtype: Optional[Union[str, torch.dtype]] = "auto",
-        trust_remote_code: Optional[bool] = False,
-        # arguments used for splitting a model across GPUs naively.
-        # only used if `parallelize=True`.
-        # (accelerate naive PP (device_map) options)
-        parallelize: Optional[bool] = False,
-        gpus: Optional[int] = None,
-        max_memory_per_gpu: Optional[Union[int, str]] = None,
-        max_cpu_memory: Optional[Union[int, str]] = None,
-        offload_folder: Optional[str] = "./offload",
-        # PEFT, delta weights and quantization options
-        peft: Optional[str] = None,
-        delta: Optional[str] = None,
-        autogptq: Optional[Union[bool, str]] = False,
-        gptqmodel: Optional[bool] = False,
-        gguf_file: Optional[str] = None,
-        quantization_config: Optional[Dict[str, Any]] = None,
-        **kwargs,
-    ) -> None:
-        """
-        Initializes an HF or HF-compatible PreTrainedModel from scratch
-        inside HFLM, using the kwargs passed into self.__init__().
-        Also handles functionality such as AutoGPTQ usage and PEFT wrapping.
-        For future similar extensions to AutoGPTQ that are not core to HF's ecosystem,
-        (such as PyTorch models that are nearly, but not quite, fully mirroring
-        HF's public interface relied on in this HFLM class)
-        please consider subclassing HFLM and overriding this and other methods as needed.
-        """
-        model_kwargs = kwargs if kwargs else {}
-        model_kwargs.update(
-            self._get_accelerate_args(
-                parallelize=parallelize,
-                device_map=kwargs.get("device_map", None),
-                max_memory_per_gpu=max_memory_per_gpu,
-                max_cpu_memory=max_cpu_memory,
-                offload_folder=offload_folder,
-                gpus=gpus,
-            )
-        )
-        if not autogptq and not gptqmodel:
-            if model_kwargs.get("load_in_4bit", None):
-                assert transformers.__version__ >= "4.30.0", (
-                    "load_in_4bit requires transformers >= 4.30.0"
-                )
-            if transformers.__version__ >= "4.30.0":
-                if model_kwargs.get("load_in_4bit", None):
-                    if model_kwargs.get("bnb_4bit_compute_dtype", None):
-                        model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(
-                            model_kwargs["bnb_4bit_compute_dtype"]
-                        )
-            self._model = self.AUTO_MODEL_CLASS.from_pretrained(
-                pretrained,
-                revision=revision,
-                torch_dtype=get_dtype(dtype),
-                trust_remote_code=trust_remote_code,
-                gguf_file=gguf_file,
-                quantization_config=quantization_config,
-                **model_kwargs,
-            )
-        else:
-            if autogptq and gptqmodel:
-                raise ValueError(
-                    "Cannot use both 'autogptq' and 'gptqmodel' options at the same time."
-                )
-            if autogptq:
-                try:
-                    from auto_gptq import AutoGPTQForCausalLM
-                except ModuleNotFoundError as exception:
-                    raise type(exception)(
-                        "Tried to load auto_gptq, but auto-gptq is not installed ",
-                        "please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]",
-                    )
-                self._model = AutoGPTQForCausalLM.from_quantized(
-                    pretrained,
-                    trust_remote_code=trust_remote_code,
-                    model_basename=None if autogptq is True else Path(autogptq).stem,
-                    use_safetensors=True
-                    if autogptq is True
-                    else autogptq.endswith(".safetensors"),
-                    **model_kwargs,
-                )
-            if gptqmodel:
-                try:
-                    from gptqmodel import GPTQModel
-                except ModuleNotFoundError as exception:
-                    raise type(exception)(
-                        "Tried to load gptqmodel, but gptqmodel is not installed ",
-                        "please install gptqmodel via `pip install gptqmodel --no-build-isolation` or `pip install lm-eval[gptqmodel] --no-build-isolation`",
-                    )
-                self._model = GPTQModel.from_quantized(
-                    pretrained, trust_remote_code=trust_remote_code, **model_kwargs
-                )
-        if peft and delta:
-            raise ValueError(
-                "Cannot use both 'peft' and 'delta' options at the same time."
-            )
-        if peft:
-            if model_kwargs.get("load_in_4bit", None):
-                if version.parse(PEFT_VERSION) < version.parse("0.4.0"):
-                    raise AssertionError("load_in_4bit requires peft >= 0.4.0")
-            if self._model.config.vocab_size != len(self.tokenizer):
-                # resize model for LoRAs with added tokens
-                eval_logger.info(
-                    f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
-                )
-                self._model.resize_token_embeddings(len(self.tokenizer))
-            self._model = PeftModel.from_pretrained(
-                self._model, peft, revision=revision
-            )
-        elif delta:
-            if autogptq:
-                eval_logger.warning(
-                    "Delta weights might trigger unexpected behavior when used with AutoGPTQ."
-                )
-            _model_delta = self.AUTO_MODEL_CLASS.from_pretrained(
-                delta,
-                revision=revision,
-                torch_dtype=get_dtype(dtype),
-                trust_remote_code=trust_remote_code,
-                **model_kwargs,
-            )
-            for name, param in self._model.state_dict().items():
-                try:
-                    param.data += _model_delta.state_dict()[name]
-                except KeyError:
-                    raise KeyError(f"Delta model is missing weights for layer: {name}")
-                except Exception as e:
-                    raise RuntimeError(
-                        f"Failed to add delta weights to layer {name}. Error: {e}"
-                    )
-            del _model_delta
-        return None
-    def _create_tokenizer(
-        self,
-        pretrained: Union[str, transformers.PreTrainedModel],
-        tokenizer: Optional[
-            Union[
-                str,
-                transformers.PreTrainedTokenizer,
-                transformers.PreTrainedTokenizerFast,
-            ]
-        ],
-        revision: Optional[str] = "main",
-        trust_remote_code: Optional[bool] = False,
-        use_fast_tokenizer: Optional[bool] = True,
-        gguf_file: Optional[str] = None,
-        add_bos_token: Optional[bool] = False,
-    ) -> None:
-        """
-        Helper method during initialization.
-        Create a tokenizer object corresponding to the correct
-        tokenizer for value of `pretrained`, or use the pre-initialized tokenizer passed.
-        """
-        kwargs = {
-            "revision": revision,
-            "trust_remote_code": trust_remote_code,
-        }
-        # gguf format embeds tokenizer and is not compatible with hf tokenizer `use_fast` param
-        if gguf_file is not None:
-            kwargs["gguf_file"] = gguf_file
-        else:
-            kwargs["use_fast"] = use_fast_tokenizer
-        if add_bos_token:
-            kwargs["add_bos_token"] = True
-        if tokenizer:
-            if isinstance(tokenizer, str):
-                self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-                    tokenizer, **kwargs
-                )
-            else:
-                assert isinstance(
-                    tokenizer, transformers.PreTrainedTokenizer
-                ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
-                self.tokenizer = tokenizer
-        else:
-            # Get tokenizer based on 'pretrained'
-            if isinstance(pretrained, str):
-                model_name = pretrained
-            else:
-                # get the HF hub name via accessor on model
-                model_name = self.model.name_or_path
-            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-                model_name, **kwargs
-            )
-        return None
-    def _detect_batch_size(self, requests=None, pos: int = 0):
-        if requests:
-            _, context_enc, continuation_enc = requests[pos]
-            max_length = len(
-                (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1]
-            )
-            max_context_enc = len(context_enc[-(self.max_length + 1) :])
-            max_cont_enc = len(continuation_enc[-(self.max_length + 1) :])
-        else:
-            max_length = self.max_length
-            max_context_enc = max_length
-            max_cont_enc = max_length
-        # if OOM, then halves batch_size and tries again
-        @find_executable_batch_size(starting_batch_size=self.max_batch_size)
-        def forward_batch(batch_size):
-            if self.backend == "seq2seq":
-                length = max(max_context_enc, max_cont_enc)
-                batched_conts = torch.ones(
-                    (batch_size, length), device=self.device
-                ).long()
-                test_batch = torch.ones((batch_size, length), device=self.device).long()
-                call_kwargs = {
-                    "attn_mask": test_batch,
-                    "labels": batched_conts,
-                }
-            else:
-                call_kwargs = {}
-                test_batch = torch.ones(
-                    (batch_size, max_length), device=self.device
-                ).long()
-            for _ in range(5):
-                out = F.log_softmax(  # noqa: F841
-                    self._model_call(test_batch, **call_kwargs),
-                    dim=-1,
-                    dtype=self.softmax_dtype,
-                )
-            return batch_size
-        try:
-            batch_size = forward_batch()
-        except RuntimeError as e:
-            if "No executable batch size found" in str(e):
-                batch_size = 1
-            else:
-                raise
-        if self.world_size > 1:
-            # if multi-GPU, always take minimum over all selected batch sizes
-            max_rnk_bs = torch.tensor([batch_size], device=self.device)
-            gathered = (
-                self.accelerator.gather(max_rnk_bs).cpu().detach().numpy().tolist()
-            )
-            batch_size = min(gathered)
-            clear_torch_cache()
-            return batch_size
-        clear_torch_cache()
-        return batch_size
-    def tok_encode(
-        self, string: str, left_truncate_len=None, add_special_tokens=None
-    ) -> List[int]:
-        """ """
-        # default for None - empty dict, use predefined tokenizer param
-        # used for all models except for CausalLM or predefined value
-        special_tokens_kwargs = {}
-        # by default for CausalLM - false or self.add_bos_token is set
-        if add_special_tokens is None:
-            if self.backend == "causal":
-                special_tokens_kwargs = {
-                    "add_special_tokens": False or self.add_bos_token
-                }
-        # otherwise the method explicitly defines the value
-        else:
-            special_tokens_kwargs = {"add_special_tokens": add_special_tokens}
-        encoding = self.tokenizer.encode(string, **special_tokens_kwargs)
-        # left-truncate the encoded context to be at most `left_truncate_len` tokens long
-        if left_truncate_len:
-            encoding = encoding[-left_truncate_len:]
-        return encoding
-    def tok_batch_encode(
-        self,
-        strings: List[str],
-        padding_side: str = "left",
-        left_truncate_len: int = None,
-        truncation: bool = False,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
-        old_padding_side = self.tokenizer.padding_side
-        self.tokenizer.padding_side = padding_side
-        add_special_tokens = {}
-        if self.backend == "causal":
-            add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
-        encoding = self.tokenizer(
-            strings,
-            truncation=truncation,
-            padding="longest",
-            return_tensors="pt",
-            **add_special_tokens,
-        )
-        if left_truncate_len:
-            original_lengths = encoding["input_ids"].size(1)
-            if original_lengths > left_truncate_len:
-                eval_logger.warn(
-                    f"Left truncation applied. Original sequence length was {original_lengths}, "
-                    f"truncating to last {left_truncate_len} tokens. Some content will be lost.",
-                )
-            encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
-            encoding["attention_mask"] = encoding["attention_mask"][
-                :, -left_truncate_len:
-            ]
-        self.tokenizer.padding_side = old_padding_side
-        return encoding["input_ids"], encoding["attention_mask"]
-    def tok_decode(self, tokens, skip_special_tokens=True):
-        return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)
-    def _model_call(self, inps, attn_mask=None, labels=None):
-        """
-        :param inps: torch.Tensor
-            A torch tensor of shape [batch, (sequence_ctx + sequence_cont)] or of shape
-            [batch, sequence_ctx]. the size of sequence may vary from call to call
-        :param attn_mask: torch.Tensor, optional
-            A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. Only passed
-            (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM
-        :param labels: torch.Tensor, optional
-            A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. Only passed
-            (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM
-        :return
-            A torch tensor of shape [batch, sequence, vocab] with the
-        logits returned from the model's decoder
-        """
-        with torch.no_grad():
-            if attn_mask is not None or labels is not None:
-                assert attn_mask is not None and labels is not None
-                assert self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM
-                return self.model(
-                    input_ids=inps, attention_mask=attn_mask, labels=labels
-                ).logits
-            else:
-                assert self.AUTO_MODEL_CLASS in (
-                    transformers.AutoModelForCausalLM,
-                    transformers.AutoModelForVision2Seq,
-                )
-                return self.model(inps).logits
-    def _model_generate(self, context, max_length, stop, **generation_kwargs):
-        # temperature = 0.0 if not set
-        # if do_sample is false and temp==0.0:
-        # remove temperature, as do_sample=False takes care of this
-        # and we don't want a warning from HF
-        generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
-        do_sample = generation_kwargs.get("do_sample", None)
-        # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
-        if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
-            generation_kwargs["do_sample"] = do_sample = False
-        if do_sample is False and generation_kwargs.get("temperature") == 0.0:
-            generation_kwargs.pop("temperature")
-        # build stopping criteria
-        stopping_criteria = stop_sequences_criteria(
-            self.tokenizer, stop, context.shape[1], context.shape[0]
-        )
-        return self.model.generate(
-            input_ids=context,
-            max_length=max_length,
-            stopping_criteria=stopping_criteria,
-            pad_token_id=self.tokenizer.pad_token_id,
-            use_cache=True,
-            **generation_kwargs,
-        )
-    def _select_cont_toks(
-        self, logits: torch.Tensor, contlen: int = None, inplen: int = None
-    ) -> torch.Tensor:
-        if self.backend == "causal":
-            assert contlen and inplen, (
-                "Must pass input len and cont. len to select scored logits for causal LM"
-            )
-            # discard right-padding.
-            # also discard the input/context tokens. we'll only score continuations.
-            logits = logits[inplen - contlen : inplen]
-        elif self.backend == "seq2seq":
-            assert contlen and not inplen, (
-                "Selecting scored logits for Seq2SeqLM requires only cont. len"
-            )
-            # only discard right-padding.
-            # the logits input to this fn only contain decoder-side tokens.
-            logits = logits[:contlen]
-        return logits
-    def loglikelihood_rolling(
-        self, requests: List[Instance], disable_tqdm: bool = False
-    ) -> List[float]:
-        """
-        Score whole strings by splitting each into rolling token windows.
-        Each request's `args` is unpacked as a single string. The string is
-        tokenized, split into windows via `utils.get_rolling_token_windows` /
-        `utils.make_disjoint_window`, and every window is scored through
-        `self._loglikelihood_tokens`.
-        :param requests: instances whose `args` is a 1-tuple `(string,)`
-        :param disable_tqdm: disables the tokenization progress bar
-        :return: one value per request: the sum over that request's windows of
-            the first element of each `_loglikelihood_tokens` result
-        """
-        adaptive_batch_size = None
-        if self.batch_size == "auto":
-            # using rolling window with maximum context
-            print("Passed argument batch_size = auto. Detecting largest batch size")
-            batch_size = self._detect_batch_size()
-            print(f"Determined Largest batch size: {batch_size}")
-            adaptive_batch_size = batch_size
-        # First, collect all windows from all requests
-        all_windows = []  # List of (request_idx, window) tuples
-        request_window_counts = []  # Track number of windows per request
-        for req_idx, (string,) in enumerate(
-            tqdm(
-                [req.args for req in requests],
-                disable=(disable_tqdm or (self.rank != 0)),
-            )
-        ):
-            rolling_token_windows: List[Tuple[List[int], List[int]]] = list(
-                map(
-                    utils.make_disjoint_window,
-                    utils.get_rolling_token_windows(
-                        token_list=self.tok_encode(string),
-                        prefix_token=self.prefix_token_id,
-                        max_seq_len=self.max_length,
-                        context_len=1,
-                    ),
-                )
-            )
-            # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
-            # prepend None as the cache key so `_loglikelihood_tokens` skips its per-window cache write
-            windows = [(None,) + x for x in rolling_token_windows]
-            # Store windows with their request index
-            all_windows.extend((req_idx, window) for window in windows)
-            request_window_counts.append(len(windows))
-        # Handle distributed case padding
-        pad_amnt = 0
-        if self.world_size > 1:
-            # pad every rank up to the largest window count seen across ranks
-            mytensor = torch.tensor(len(all_windows), device=self.device)
-            gathered = self.accelerator.gather(mytensor).cpu().detach().numpy().tolist()
-            pad_amnt = max(gathered) - gathered[self.rank]
-            if pad_amnt > 0:
-                # NOTE(review): indexes all_windows[0], so a rank with no windows
-                # of its own would raise IndexError here; confirm callers never do that
-                all_windows += pad_amnt * [all_windows[0]]
-        all_nlls = []
-        batch_size = adaptive_batch_size or self.batch_size
-        for i in range(0, len(all_windows), batch_size):
-            batch = all_windows[i : i + batch_size]
-            # Extract just the windows for processing, keeping track of request indices
-            batch_indices, batch_windows = zip(*batch)
-            # NOTE(review): the inner call hard-codes disable_tqdm=False rather than
-            # forwarding this method's `disable_tqdm`; confirm that is intended
-            batch_nlls = self._loglikelihood_tokens(
-                requests=batch_windows,
-                disable_tqdm=False,
-                override_bs=len(batch_windows),
-            )
-            # Store results with their request indices
-            all_nlls.extend(zip(batch_indices, batch_nlls))
-        # Remove padding if necessary
-        if (self.world_size > 1) and (pad_amnt > 0):
-            all_nlls = all_nlls[:-pad_amnt]
-        # Reconstruct per-request loglikelihoods
-        loglikelihoods = []
-        current_idx = 0
-        for window_count in request_window_counts:
-            # Get all nlls for this request
-            request_nlls = all_nlls[current_idx : current_idx + window_count]
-            # Sum up the nlls for this request (discarding is_greedy)
-            request_total = sum(nll[0] for _, nll in request_nlls)
-            loglikelihoods.append(request_total)
-            current_idx += window_count
-            # results are appended in request order, so the newest entry maps to this request
-            string = requests[len(loglikelihoods) - 1].args[0]
-            self.cache_hook.add_partial(
-                "loglikelihood_rolling", (string,), request_total
-            )
-        return loglikelihoods
-    def _batch_scheduler(self, pos, n_reordered_requests):
-        sched = pos // int(len(n_reordered_requests) / self.batch_schedule)
-        if sched in self.batch_sizes:
-            return self.batch_sizes[sched]
-        if (len(self.batch_sizes) > 1) and (
-            self.batch_sizes[sched - 1] == self.max_batch_size
-        ):
-            # if previous batch size is already maximal, skip recomputation
-            self.batch_sizes[sched] = self.max_batch_size
-            return self.batch_sizes[sched]
-        print(
-            f"Passed argument batch_size = auto:{self.batch_schedule}. Detecting largest batch size"
-        )
-        self.batch_sizes[sched] = self._detect_batch_size(n_reordered_requests, pos)
-        print(f"Determined largest batch size: {self.batch_sizes[sched]}")
-        return self.batch_sizes[sched]
-    def _loglikelihood_tokens(
-        self,
-        requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
-        disable_tqdm: bool = False,
-        override_bs: int = None,
-    ) -> List[Tuple[float, bool]]:
-        # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
-        res = []
-        def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]):
-            """Defines the key for the sorted method"""
-            # the negative sign on len(toks) sorts descending - this has a few advantages:
-            # - time estimates will always be over not underestimates, which is more useful for planning
-            # - to know the size of a batch when going through the list, you know the first one is always the batch
-            #   padded context length. this is useful to simplify the batching logic and more importantly to make
-            #   automatic adaptive batches much much easier to implement
-            # - any OOMs will happen right away rather than near the end
-            toks = req[1] + req[2]
-            return -len(toks), tuple(toks)
-        def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]):
-            """Defines the key to group and lookup one-token continuations"""
-            # Use with group_by="contexts" (optional)"
-            # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations.
-            # speeds up some multiple-choice tasks proportionally to the number of choices.
-            # groups requests by context+continuation[:-1] and infer on one request/group.
-            return req[-2] + req[-1][:-1]
-        re_ord = Collator(
-            requests,
-            sort_fn=_collate,
-            group_by="contexts"
-            if self.backend == "causal" and self.logits_cache
-            else None,
-            group_fn=_lookup_one_token_cont,
-        )
-        # automatic (variable) batch size detection for vectorization
-        # pull longest context sample from request
-        n_reordered_requests = len(re_ord)
-        batch_size = (
-            self.batch_size
-            if self.batch_size != "auto"
-            else override_bs
-            if override_bs is not None
-            else 0
-        )
-        batch_fn = (
-            self._batch_scheduler
-            if self.batch_size == "auto"
-            and n_reordered_requests > 0
-            and not override_bs
-            else None
-        )
-        chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn)
-        pbar = tqdm(
-            total=len(requests),
-            disable=(disable_tqdm or (self.rank != 0)),
-            desc="Running loglikelihood requests",
-        )
-        for chunk in chunks:
-            inps = []
-            cont_toks_list = []
-            inplens = []
-            conts = []
-            encoder_attns = []
-            padding_len_inp = None
-            padding_len_cont = None
-            # because vectorizing is annoying, we first convert each (context, continuation) pair to padded
-            # tensors, then we pack them together into a batch, call the model, and then pick it all apart
-            # again because vectorizing is annoying
-            for _, context_enc, continuation_enc in chunk:
-                # sanity check
-                assert len(context_enc) > 0
-                assert len(continuation_enc) > 0
-                assert len(continuation_enc) <= self.max_length
-                # how this all works (illustrated on a causal decoder-only setup):
-                #          CTX      CONT
-                # inp    0 1 2 3|4 5 6 7 8 9   <- last token is deleted by inp[:, :-1]
-                # model  \               \
-                # logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the
-                # cont_toks      4 5 6 7 8 9      [:, -len(continuation_enc):, :self.vocab_size] slice
-                # when too long to fit in context, truncate from the left
-                if self.backend == "causal":
-                    total_length = len(context_enc) + len(continuation_enc)
-                    if total_length > self.max_length + 1:
-                        eval_logger.warning(
-                            f"Combined length of context ({len(context_enc)}) and continuation ({len(continuation_enc)}) "
-                            f"exceeds model's maximum length ({self.max_length}). "
-                            f"Truncating {total_length - self.max_length + 1} tokens from the left."
-                        )
-                    inp = torch.tensor(
-                        (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
-                        dtype=torch.long,
-                        device=self.device,
-                    )
-                    (inplen,) = inp.shape
-                elif self.backend == "seq2seq":
-                    inp = torch.tensor(
-                        (context_enc)[-self.max_length :],
-                        dtype=torch.long,
-                        device=self.device,
-                    )
-                    (inplen,) = inp.shape
-                    # build encoder attn masks
-                    encoder_attns.append(torch.ones_like(inp))
-                    cont = torch.tensor(
-                        (continuation_enc)[-self.max_length :],
-                        # TODO: left-shift these?
-                        # TODO: our code assumes we never end up truncating conts for either model type
-                        dtype=torch.long,
-                        device=self.device,
-                    )
-                    (contlen,) = cont.shape
-                    conts.append(cont)
-                    padding_len_cont = (
-                        max(padding_len_cont, contlen)
-                        if padding_len_cont is not None
-                        else contlen
-                    )
-                padding_len_inp = (
-                    max(padding_len_inp, inplen)
-                    if padding_len_inp is not None
-                    else inplen
-                )
-                inps.append(inp)  # [1, inp_length]
-                cont_toks_list.append(continuation_enc)
-                inplens.append(inplen)
-            # create encoder attn mask and batched conts, if seq2seq
-            call_kwargs = {}
-            if self.backend == "causal":
-                batched_inps = pad_and_concat(
-                    padding_len_inp, inps, padding_side="right"
-                )  # [batch, padding_len_inp]
-            elif self.backend == "seq2seq":
-                # TODO: left-pad encoder inps and mask?
-                batched_inps = pad_and_concat(
-                    padding_len_inp, inps
-                )  # [batch, padding_len_inp]
-                batched_conts = pad_and_concat(
-                    padding_len_cont, conts
-                )  # [batch, padding_len_cont]
-                batched_encoder_mask = pad_and_concat(
-                    padding_len_inp, encoder_attns
-                )  # [batch, padding_len_inp]
-                call_kwargs = {
-                    "attn_mask": batched_encoder_mask,
-                    "labels": batched_conts,
-                }
-            multi_logits = F.log_softmax(
-                self._model_call(batched_inps, **call_kwargs),
-                dim=-1,
-                dtype=self.softmax_dtype,
-            )  # [batch, padding_length (inp or cont), vocab]
-            for (request_str, ctx_tokens, _), logits, inplen, cont_toks in zip(
-                chunk, multi_logits, inplens, cont_toks_list
-            ):
-                # Slice to original seq length
-                contlen = len(cont_toks)
-                # take only logits in the continuation
-                # (discard context toks if decoder-only ; discard right-padding)
-                # also discards + checks for "virtual tokens" in the causal LM's input window
-                # from prompt/prefix tuning tokens, if applicable
-                ctx_len = (
-                    inplen + (logits.shape[0] - padding_len_inp)
-                    if self.backend == "causal"
-                    else None
-                )
-                logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len)
-                logits = logits.unsqueeze(0)  # [1, seq, vocab]
-                # Check if per-token argmax is exactly equal to continuation
-                greedy_tokens = logits.argmax(dim=-1)
-                # check for one-token continuation cache hits.
-                # noop in case group_by != "contexts" or no cache hit and returns the
-                # original args. Otherwise, expands the logits batch dimension and yields each
-                # batch along with matching continuation tokens and prompt strings.
-                # logits -> [1, seq, vocab]
-                for request_str, cont_toks, logits in re_ord.get_cache(
-                    req_str=request_str,
-                    cxt_toks=ctx_tokens,
-                    cont_toks=cont_toks,
-                    logits=logits,
-                ):
-                    cont_toks = torch.tensor(
-                        cont_toks, dtype=torch.long, device=self.device
-                    ).unsqueeze(0)  # [1, seq]
-                    # Use trailing slice [-cont_toks.shape[1]:] to handle variable length cont_len (but same ctx+cont[:-1]).
-                    # i.e. continuations can be sliced at diff points. Collator ensures we have sufficient greedy_tokens
-                    # by choosing key with longest cont if group_by="contexts".
-                    max_equal = (
-                        greedy_tokens[:, -cont_toks.shape[1] :] == cont_toks
-                    ).all()
-                    # Obtain log-probs at the corresponding continuation token indices
-                    # last_token_slice = logits[:, -1, :].squeeze(0).tolist()
-                    logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(
-                        -1
-                    )  # [1, seq]
-                    # Answer: (log prob, is-exact-match)
-                    answer = (float(logits.sum()), bool(max_equal))
-                    res.append(answer)
-                    if request_str is not None:
-                        # special case: loglikelihood_rolling produces a number of loglikelihood requests
-                        # all with cache key None. instead do add_partial on the per-example level
-                        # in the loglikelihood_rolling() function for those.
-                        self.cache_hook.add_partial(
-                            "loglikelihood", request_str, answer
-                        )
-                    pbar.update(1)
-        pbar.close()
-        return re_ord.get_original(res)
-    def generate_until(
-        self, requests: List[Instance], disable_tqdm: bool = False
-    ) -> List[str]:
-        """
-        Generate a continuation for every request until a stop sequence is hit.
-        Each request's `args` is treated as `(context, gen_kwargs)`. Requests are
-        grouped by `gen_kwargs`, batched, generated with `_model_generate`, decoded,
-        and cut at the first occurrence of any non-empty stop sequence.
-        :param requests: instances whose `args` is `(context, gen_kwargs)`
-        :param disable_tqdm: disables the progress bar
-        :return: generated strings, reordered by `re_ords.get_original`
-        :raises ValueError: if a batch's `gen_kwargs` is not a dict
-        """
-        res = []
-        def _collate(req: Tuple[str, dict]):
-            """Defines the key for the sorted method"""
-            # the negative sign on len(toks) sorts descending - this has a few advantages:
-            # - time estimates will always be over not underestimates, which is more useful for planning
-            # - to know the size of a batch when going through the list, you know the first one is always the batch
-            #   padded context length. this is useful to simplify the batching logic and more importantly to make
-            #   automatic adaptive batches much much easier to implement
-            # - any OOMs will happen right away rather than near the end
-            toks = self.tok_encode(req[0])
-            return -len(toks), req[0]
-        pbar = tqdm(
-            total=len(requests),
-            disable=(disable_tqdm or (self.rank != 0)),
-            desc="Running generate_until requests",
-        )
-        adaptive_batch_size = None
-        if self.batch_size == "auto":
-            # using rolling window with maximum context
-            print("Passed argument batch_size = auto. Detecting largest batch size")
-            batch_size = self._detect_batch_size()
-            print(f"Determined Largest batch size: {batch_size}")
-            adaptive_batch_size = batch_size
-        # for each different set of kwargs, we execute all requests, by batch.
-        batch_size = (
-            self.batch_size
-            if self.batch_size != "auto"
-            else adaptive_batch_size
-            if adaptive_batch_size is not None
-            else 0
-        )
-        # NOTE(review): in "auto" mode adaptive_batch_size is assigned just above, so
-        # the scheduler is only selected if detection returned a falsy value
-        batch_fn = (
-            self._batch_scheduler
-            if self.batch_size == "auto" and not adaptive_batch_size
-            else None
-        )
-        # we group requests by their generation_kwargs,
-        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
-        # in the same batch.
-        # group_fn=lambda x: x[1] -> x=(context, gen_kwargs)
-        re_ords = Collator(
-            [reg.args for reg in requests],
-            sort_fn=_collate,
-            group_by="gen_kwargs",
-            group_fn=lambda x: x[1],
-        )
-        chunks = re_ords.get_batched(n=batch_size, batch_fn=batch_fn)
-        # textual form of the end-of-text token, added to every stop list below
-        eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False)
-        for chunk in chunks:
-            contexts, all_gen_kwargs = zip(*chunk)
-            # we assume all gen kwargs in the batch are the same
-            # this is safe to assume because the `grouper` object ensures it.
-            gen_kwargs = all_gen_kwargs[0]
-            # unpack our keyword arguments.
-            if isinstance(gen_kwargs, dict):
-                kwargs = copy.deepcopy(gen_kwargs)  # edge case for repeats > 1
-                # add EOS token to stop sequences
-                until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
-            else:
-                raise ValueError(
-                    f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
-                )
-            if "max_gen_toks" in kwargs.keys():
-                max_gen_toks = kwargs.pop("max_gen_toks")
-            else:
-                max_gen_toks = self.max_gen_toks
-            # set the max length in tokens of inputs ("context_enc")
-            # NOTE(review): max_ctx_len is only bound for the "causal" and "seq2seq"
-            # backends; any other backend value would hit a NameError below
-            if self.backend == "causal":
-                # max len for inputs = max length, minus room to generate the max new tokens
-                max_ctx_len = self.max_length - max_gen_toks
-                assert max_ctx_len > 0, (
-                    f"Invalid configuration: requested max tokens to generate ({max_gen_toks}) must be less than model's maximum sequence length ({self.max_length})."
-                )
-            elif self.backend == "seq2seq":
-                # max len for inputs = encoder's whole max_length
-                max_ctx_len = self.max_length
-            # encode, pad, and truncate contexts for this batch
-            context_enc, attn_masks = self.tok_batch_encode(
-                contexts,
-                left_truncate_len=max_ctx_len,
-                truncation=self.truncation,
-            )
-            context_enc = context_enc.to(self.device)
-            attn_masks = attn_masks.to(self.device)
-            # a caller-supplied max_length wins; otherwise allow max_gen_toks past the padded context
-            if "max_length" not in kwargs:
-                kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
-            # perform batched generation
-            cont = self._model_generate(
-                context=context_enc,
-                attention_mask=attn_masks,
-                stop=until,
-                **kwargs,
-            )
-            cont_toks_list = cont.tolist()
-            for cont_toks, context in zip(cont_toks_list, contexts):
-                # discard context + left-padding toks if using causal decoder-only LM
-                if self.backend == "causal":
-                    cont_toks = cont_toks[context_enc.shape[1] :]
-                s = self.tok_decode(cont_toks)
-                # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
-                for term in until:
-                    if len(term) > 0:
-                        # ignore '' separator,
-                        # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
-                        s = s.split(term)[0]
-                res.append(s)
-                self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
-                pbar.update(1)
-        # reorder this group of results back to original unsorted form
-        res = re_ords.get_original(res)
-        pbar.close()
-        return res
-    def apply_chat_template(
-        self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
-    ) -> str:
-        """
-        Method to apply a chat template to a list of chat history between user and model.
-        """
-        try:
-            chat_templated = self.tokenizer.apply_chat_template(
-                chat_history,
-                tokenize=False,
-                add_generation_prompt=add_generation_prompt,
-                continue_final_message=not add_generation_prompt,
-            )
-        except jinja2.exceptions.TemplateError:
-            eval_logger.warning(
-                "Failed to apply chat template. removing the system role in chat history."
-            )
-            chat_history = [msg for msg in chat_history if msg["role"] != "system"]
-            chat_templated = self.tokenizer.apply_chat_template(
-                chat_history,
-                tokenize=False,
-                add_generation_prompt=add_generation_prompt,
-                continue_final_message=not add_generation_prompt,
-            )
-        return chat_templated
-    def get_model_info(self) -> dict:
-        """
-        Method to get Hugging Face model information for experiment reproducibility.
-        """
-        def get_model_num_params(model) -> int:
-            if hasattr(model, "num_parameters"):
-                return model.num_parameters()
-            if hasattr(model, "parameters"):
-                return sum(p.numel() for p in model.parameters())
-            else:
-                return -1
-        def get_model_dtype(model) -> str:
-            if hasattr(model, "dtype"):
-                return model.dtype
-            else:
-                return ""
-        def get_model_sha(pretrained: str, revision: str) -> str:
-            try:
-                model_info = HfApi().model_info(repo_id=pretrained, revision=revision)
-                return model_info.sha
-            except Exception as e:
-                eval_logger.debug(
-                    f"Failed to get model SHA for {pretrained} at revision {revision}. Error: {e}"
-                )
-                return ""
-        model_info = {
-            "model_num_parameters": get_model_num_params(self._model),
-            "model_dtype": get_model_dtype(self._model),
-            "model_revision": self.revision,
-            "model_sha": get_model_sha(self.pretrained, self.revision),
-        }
-        if self.peft:
-            model_info["peft_sha"] = get_model_sha(self.peft, self.revision)
-        if self.delta:
-            model_info["delta_sha"] = get_model_sha(self.delta, self.revision)
-        return model_info

lm-evaluation-harness/lm_eval/models/ibm_watsonx_ai.py DELETED Viewed

@@ -1,445 +0,0 @@
-import copy
-import json
-import logging
-import os
-import warnings
-from functools import lru_cache
-from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Type, cast
-from tqdm import tqdm
-from lm_eval.api.instance import Instance
-from lm_eval.api.model import LM
-from lm_eval.api.registry import register_model
-from lm_eval.models.api_models import JsonChatStr
-from lm_eval.utils import simple_parse_args_string
-eval_logger = logging.getLogger(__name__)
-class LogLikelihoodResult(NamedTuple):
-    log_likelihood: float
-    is_greedy: bool
-def _verify_credentials(creds: dict) -> None:
-    """
-    Validate credentials for APIClient authentication.
-    Required conditions:
-    - Either ("username" and "password") or "apikey" must be present.
-    - "url" is mandatory.
-    - Either "project_id" or "space_id" must be present.
-    """
-    env_var_map = {
-        "apikey": "WATSONX_API_KEY",
-        "token": "WATSONX_TOKEN",
-        "url": "WATSONX_URL",
-        "project_id": "WATSONX_PROJECT_ID",
-        "space_id": "WATSONX_SPACE_ID",
-        "username": "WATSONX_USERNAME",
-        "password": "WATSONX_PASSWORD",
-    }
-    # Check authentication: Either ("username" and "password") or "apikey" must be provided
-    has_auth = all(creds.get(key) for key in ["username", "password"]) or creds.get(
-        "apikey"
-    )
-    # Check required fields: "url" must be present
-    has_url = "url" in creds and creds["url"]
-    # Check project/space ID requirement: Either "project_id" or "space_id" must be present
-    has_project_or_space_id = any(creds.get(key) for key in ["project_id", "space_id"])
-    if not (has_auth and has_url and has_project_or_space_id):
-        missing_keys = []
-        if not has_auth:
-            missing_keys.append(
-                f"either ('username' and 'password') or 'apikey' ({env_var_map['apikey']})"
-            )
-        if not has_url:
-            missing_keys.append(f"url ({env_var_map['url']})")
-        if not has_project_or_space_id:
-            missing_keys.append(
-                f"either 'project_id' ({env_var_map['project_id']}) or 'space_id' ({env_var_map['space_id']})"
-            )
-        error_msg = f"Missing required credentials: {', '.join(missing_keys)}. "
-        error_msg += "Please set the environment variables indicated in parentheses."
-        raise ValueError(error_msg)
-@lru_cache(maxsize=None)
-def get_watsonx_credentials() -> Dict[str, str]:
-    """
-    Retrieves Watsonx API credentials from environmental variables.
-    Returns:
-        Dict[str, str]: A dictionary containing the credentials necessary for authentication, including
-                        keys such as `apikey` or `token`, `url`, and `project_id`.
-    Raises:
-        AssertionError: If the credentials format is invalid or any of the necessary credentials are missing.
-    """
-    try:
-        from dotenv import load_dotenv
-    except ImportError:
-        raise ImportError(
-            "Could not import dotenv: Please install lm_eval[ibm_watsonx_ai] package."
-        )
-    # This function attempts to load a file named .env starting from the CWD and working backwards
-    # towards root. KV pairs are parsed and stored as env vars iff not already set
-    load_dotenv()
-    credentials = {
-        "username": os.getenv("WATSONX_USERNAME", None),
-        "password": os.getenv("WATSONX_PASSWORD", None),
-        "apikey": os.getenv("WATSONX_API_KEY", None),
-        "token": os.getenv("WATSONX_TOKEN", None),
-        "url": os.getenv("WATSONX_URL", None),
-        "project_id": os.getenv("WATSONX_PROJECT_ID", None),
-        "space_id": os.getenv("WATSONX_SPACE_ID", None),
-    }
-    if "cloud.ibm.com" not in credentials["url"]:
-        credentials["instance_id"] = "openshift"
-    if all(credentials.get(key) for key in ["username", "password", "apikey"]):
-        warnings.warn(
-            "You're passing `username`, `password`, and `apikey` at the same time, "
-            "which might cause issues. More info on authentication in different scenarios "
-            "can be found in the docs: https://ibm.github.io/watsonx-ai-python-sdk/setup_cpd.html"
-        )
-    _verify_credentials(credentials)
-    return credentials
-@register_model("watsonx_llm")
-class WatsonxLLM(LM):
-    """
-    Implementation of LM model interface for evaluating Watsonx model with the lm_eval framework.
-    See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/model_guide.md for reference.
-    """
-    @classmethod
-    def create_from_arg_string(
-        cls: Type["WatsonxLLM"],
-        arg_string: str,
-        additional_config: Optional[Dict] = None,
-    ) -> "WatsonxLLM":
-        """
-        Allow the user to specify model parameters (TextGenerationParameters) in CLI arguments.
-        """
-        try:
-            from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
-        except ImportError:
-            raise ImportError(
-                "Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
-            )
-        args = simple_parse_args_string(arg_string)
-        args.update(additional_config)
-        model_id = args.pop("model_id", None)
-        deployment_id = args.pop("deployment_id", None)
-        if model_id is None and deployment_id is None:
-            raise ValueError(
-                "'model_id' or 'deployment_id' is required, please pass it in 'model_args'"
-            )
-        if not args.get("do_sample", None):
-            args["temperature"] = None
-            args["top_p"] = None
-            args["top_k"] = None
-            args["seed"] = None
-        generate_params = {
-            GenParams.DECODING_METHOD: (
-                "greedy" if not args.get("do_sample", None) else "sample"
-            ),
-            GenParams.LENGTH_PENALTY: args.get("length_penalty", None),
-            GenParams.TEMPERATURE: args.get("temperature", None),
-            GenParams.TOP_P: args.get("top_p", None),
-            GenParams.TOP_K: args.get("top_k", None),
-            GenParams.RANDOM_SEED: args.get("seed", None),
-            GenParams.REPETITION_PENALTY: args.get("repetition_penalty", None),
-            GenParams.MIN_NEW_TOKENS: args.get("min_new_tokens", None),
-            GenParams.MAX_NEW_TOKENS: args.get("max_new_tokens", 256),
-            GenParams.STOP_SEQUENCES: args.get("stop_sequences", None),
-            GenParams.TIME_LIMIT: args.get("time_limit", None),
-            GenParams.TRUNCATE_INPUT_TOKENS: args.get("truncate_input_tokens", None),
-            GenParams.RETURN_OPTIONS: {
-                "generated_tokens": True,
-                "input_tokens": True,
-                "token_logprobs": True,
-                "token_ranks": True,
-            },
-        }
-        generate_params = {k: v for k, v in generate_params.items() if v is not None}
-        return cls(
-            watsonx_credentials=get_watsonx_credentials(),
-            model_id=model_id,
-            deployment_id=deployment_id,
-            generate_params=generate_params,
-        )
-    def __init__(
-        self,
-        watsonx_credentials: Dict,
-        model_id,
-        deployment_id,
-        generate_params: Optional[Dict[Any, Any]] = None,
-    ) -> None:
-        try:
-            from ibm_watsonx_ai import APIClient
-            from ibm_watsonx_ai.foundation_models import ModelInference
-        except ImportError:
-            raise ImportError(
-                "Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
-            )
-        super().__init__()
-        client = APIClient(watsonx_credentials)
-        project_id = watsonx_credentials.get("project_id", None)
-        client.set.default_project(project_id)
-        self.generate_params = generate_params
-        self.model = ModelInference(
-            model_id=model_id,
-            deployment_id=deployment_id,
-            api_client=client,
-            project_id=project_id,
-        )
-        self._model_id = model_id
-    @staticmethod
-    def _has_stop_token(response_tokens: List[str], context_tokens: List[str]) -> bool:
-        """
-        Determines whether a stop token has been generated in the `response_tokens` compared to the `context_tokens`.
-        If the tokens do not match as expected, the function raises a RuntimeError, indicating a possible
-        misalignment between the tokens generated by the tokenizer and the model.
-        Args:
-            response_tokens (List[str]): The List of tokens generated as a response by the model.
-            context_tokens (List[str]): The List of tokens representing the input context.
-        Returns:
-            bool: True if the `response_tokens` likely contain a stop token that terminates the sequence,
-                  otherwise raises an exception.
-        Raises:
-            RuntimeError: If there is an unexpected mismatch between the `response_tokens` and the `context_tokens`.
-        """
-        context_length = len(context_tokens)
-        if response_tokens[: context_length - 1] == context_tokens[:-1]:
-            return (
-                response_tokens[-1] != context_tokens[-1]
-            )  # only last token differs, probably stop sequence (</s>)
-        raise RuntimeError(
-            f"There is an unexpected difference between tokenizer and model tokens:\n"
-            f"context_tokens={context_tokens}\n"
-            f"response_tokens={response_tokens[:context_length]}"
-        )
-    def _check_model_logprobs_support(self):
-        """
-        Verifies if the model supports returning log probabilities for input tokens.
-        This function sends a prompt to the model and checks whether the model's response
-        includes log probabilities for the input tokens. If log probabilities are not present,
-        it raises a `RuntimeError`, indicating that the model is not supported.
-        Raises:
-            RuntimeError: If the model does not return log probabilities for input tokens.
-        """
-        tokens = self.model.generate_text(
-            prompt=["The best ice cream flavor is:"],
-            params=self.generate_params,
-            raw_response=True,
-        )[0]["results"][0]
-        if all(token.get("logprob", None) is None for token in tokens["input_tokens"]):
-            raise RuntimeError(
-                f"Model {self._model_id} is not supported: does not return logprobs for input tokens"
-            )
-    def _get_log_likelihood(
-        self,
-        input_tokens: List[Dict[str, float]],
-        context_tokens: List[Dict[str, float]],
-    ) -> LogLikelihoodResult:
-        """
-        Calculates the log likelihood of the generated tokens compared to the context tokens.
-        Args:
-            input_tokens (List[Dict[str, float]]): A List of token dictionaries, each containing
-                token information like `text` and `logprob`.
-            context_tokens (List[Dict[str, float]]): A List of token dictionaries representing
-                the input context.
-        Returns:
-            LogLikelihoodResult: An object containing the calculated log likelihood and a boolean
-            flag indicating if the tokens were generated greedily.
-        """
-        response_tokens = [token["text"] for token in input_tokens]
-        context_length = len(context_tokens)
-        if self._has_stop_token(response_tokens, context_tokens):
-            context_length -= 1
-        return LogLikelihoodResult(
-            log_likelihood=sum(
-                token.get("logprob", 0) for token in input_tokens[context_length:]
-            ),
-            is_greedy=all(
-                token["rank"] == 1 for token in input_tokens[context_length:]
-            ),
-        )
-    def generate_until(self, requests: List[Instance]) -> List[str]:
-        """
-        Generates text responses for a List of requests, with progress tracking and caching.
-        Args:
-            requests (List[Instance]): A List of instances, each containing a text input to be processed.
-        Returns:
-            List[str]: A List of generated responses.
-        """
-        requests = [request.args for request in requests]
-        results = []
-        for request in tqdm(
-            requests,
-            desc="Running generate_until function ...",
-        ):
-            context, continuation = request
-            try:
-                if isinstance(context, JsonChatStr):
-                    context = json.loads(context.prompt)
-                    response = self.model.chat(context, self.generate_params)
-                    response = response["choices"][0]["message"]["content"]
-                else:
-                    response = self.model.generate_text(context, self.generate_params)
-            except Exception as exp:
-                eval_logger.error("Error while generating text.")
-                raise exp
-            results.append(response)
-            self.cache_hook.add_partial(
-                "generate_until", (context, continuation), response
-            )
-        return results
-    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
-        """
-        Args:
-            requests: Each request contains Instance.args : Tuple[str, str] containing:
-                1. an input string to the LM and
-                2. a target string on which the loglikelihood of the LM producing this target,
-                   conditioned on the input, will be returned.
-        Returns:
-            Tuple (loglikelihood, is_greedy) for each request according to the input order:
-                loglikelihood: probability of generating the target string conditioned on the input
-                is_greedy: True if and only if the target string would be generated by greedy sampling from the LM
-        """
-        try:
-            from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
-        except ImportError:
-            raise ImportError(
-                "Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
-            )
-        self._check_model_logprobs_support()
-        generate_params = copy.copy(self.generate_params)
-        generate_params[GenParams.MAX_NEW_TOKENS] = 1
-        requests = [request.args for request in requests]
-        results: List[LogLikelihoodResult] = []
-        # Note: We're not using batching due to (current) indeterminism of loglikelihood values when sending batch of requests
-        for request in tqdm(
-            requests,
-            desc="Running loglikelihood function ...",
-        ):
-            context, continuation = request
-            try:
-                tokenized_context = self.model.tokenize(
-                    prompt=context, return_tokens=True
-                )["result"]["tokens"]
-            except Exception as exp:
-                eval_logger.error("Error while model tokenize.")
-                raise exp
-            input_prompt = context + continuation
-            try:
-                response = self.model.generate_text(
-                    prompt=input_prompt, params=generate_params, raw_response=True
-                )
-            except Exception as exp:
-                eval_logger.error("Error while model generate text.")
-                raise exp
-            log_likelihood_response = self._get_log_likelihood(
-                response["results"][0]["input_tokens"], tokenized_context
-            )
-            results.append(log_likelihood_response)
-            self.cache_hook.add_partial(
-                "loglikelihood",
-                (context, continuation),
-                (
-                    log_likelihood_response.log_likelihood,
-                    log_likelihood_response.is_greedy,
-                ),
-            )
-        return cast(List[Tuple[float, bool]], results)
-    def loglikelihood_rolling(self, requests) -> List[Tuple[float, bool]]:
-        """
-        Used to evaluate perplexity on a data distribution.
-        Args:
-            requests: Each request contains Instance.args : Tuple[str] containing an input string to the model whose
-                entire loglikelihood, conditioned on purely the EOT token, will be calculated.
-        Returns:
-            Tuple (loglikelihood,) for each request according to the input order:
-                loglikelihood: solely the probability of producing each piece of text given no starting input.
-        """
-        try:
-            from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
-        except ImportError:
-            raise ImportError(
-                "Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
-            )
-        self._check_model_logprobs_support()
-        generate_params = copy.deepcopy(self.generate_params)
-        generate_params[GenParams.MAX_NEW_TOKENS] = 1
-        requests = [request.args for request in requests]
-        results: List[LogLikelihoodResult] = []
-        # Note: We're not using batching due to (current) indeterminism of loglikelihood values when sending batch of requests
-        for request in tqdm(
-            requests,
-            desc="Running loglikelihood_rolling function ...",
-        ):
-            context, continuation = request
-            try:
-                response = self.model.generate_text(
-                    prompt=context, params=generate_params, raw_response=True
-                )
-            except Exception as exp:
-                eval_logger.error("Error while model generate text.")
-                raise exp
-            log_likelihood_response = self._get_log_likelihood(
-                response["results"][0]["input_tokens"], []
-            )
-            results.append(log_likelihood_response)
-            self.cache_hook.add_partial(
-                "loglikelihood_rolling",
-                (context, continuation),
-                log_likelihood_response.log_likelihood,
-            )
-        return cast(List[Tuple[float, bool]], results)
-    @property
-    def tokenizer_name(self) -> str:
-        return ""
-    def apply_chat_template(
-        self, chat_history: List[Dict[str, str]]
-    ) -> List[Dict[str, str]]:
-        # A hack similar from api_model to allow encoding for cache
-        return JsonChatStr(json.dumps(chat_history))