Add files using upload-large-folder tool
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/__init__.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/base.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/evaluator.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/metrics.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/utils.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/README.md +8 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/__init__.py +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__init__.py +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__pycache__/__init__.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__pycache__/drop.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/dataset_infos.json +1 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/drop.py +192 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/__init__.py +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/__pycache__/hendrycks_math.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/dataset_infos.json +1 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/hendrycks_math.py +122 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/decontamination/__init__.py +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/decontamination/archiver.py +161 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/decontamination/decontaminate.py +169 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/decontamination/janitor.py +325 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__init__.py +20 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__pycache__/__init__.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__pycache__/dummy.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__pycache__/gpt2.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__pycache__/gpt3.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__pycache__/huggingface.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__pycache__/textsynth.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/models/dummy.py +36 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/models/gpt2.py +133 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/models/gpt3.py +233 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/models/huggingface.py +740 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/models/textsynth.py +155 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/__init__.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/anli.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/arc.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/arithmetic.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/asdiv.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/blimp.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/cbt.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/coqa.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/crowspairs.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/drop.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/gsm8k.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/headqa.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hellaswag.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hendrycks_math.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hendrycks_test.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada_cloze.cpython-310.pyc +0 -0
- scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada_multilingual.cpython-310.pyc +0 -0
scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (161 Bytes). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/base.cpython-310.pyc
ADDED
|
Binary file (32.6 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/evaluator.cpython-310.pyc
ADDED
|
Binary file (8.35 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/metrics.cpython-310.pyc
ADDED
|
Binary file (8.44 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/utils.cpython-310.pyc
ADDED
|
Binary file (9.9 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# datasets
|
| 2 |
+
|
| 3 |
+
This directory contains custom HuggingFace [dataset loading scripts](https://huggingface.co/docs/datasets/dataset_script). They are provided to maintain backward compatibility with the ad-hoc data downloaders in earlier versions of the `lm-evaluation-harness` before HuggingFace [`datasets`](https://huggingface.co/docs/datasets/index) was adopted as the default downloading manager. For example, some instances in the HuggingFace `datasets` repository process features (e.g. whitespace stripping, lower-casing, etc.) in ways that the `lm-evaluation-harness` did not.
|
| 4 |
+
|
| 5 |
+
__NOTE__: We are __not__ accepting any additional loading scripts into the main branch! If you'd like to use a custom dataset, fork the repo and follow HuggingFace's loading script guide found [here](https://huggingface.co/docs/datasets/dataset_script). You can then override your `Task`'s `DATASET_PATH` attribute to point to this script's local path.
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
__WARNING__: A handful of loading scripts are included in this collection because they have not yet been pushed to the Huggingface Hub or a HuggingFace organization repo. We will remove such scripts once pushed.
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/__init__.py
ADDED
|
File without changes
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__init__.py
ADDED
|
File without changes
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (175 Bytes). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__pycache__/drop.cpython-310.pyc
ADDED
|
Binary file (3.59 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/dataset_infos.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"drop": {"description": "DROP is a QA dataset which tests comprehensive understanding of paragraphs. In \nthis crowdsourced, adversarially-created, 96k question-answering benchmark, a \nsystem must resolve multiple references in a question, map them onto a paragraph,\nand perform discrete operations over them (such as addition, counting, or sorting).\n", "citation": "@misc{dua2019drop,\n title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, \n author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},\n year={2019},\n eprint={1903.00161},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://allenai.org/data/drop", "license": "", "features": {"section_id": {"dtype": "string", "id": null, "_type": "Value"}, "passage": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "query_id": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "validated_answers": {"feature": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, 
"hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "drop", "config_name": "drop", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 108858121, "num_examples": 77409, "dataset_name": "drop"}, "validation": {"name": "validation", "num_bytes": 12560739, "num_examples": 9536, "dataset_name": "drop"}}, "download_checksums": {"https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip": {"num_bytes": 8308692, "checksum": "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"}}, "download_size": 8308692, "post_processing_size": null, "dataset_size": 121418860, "size_in_bytes": 129727552}}
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/drop.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Custom DROP dataset that, unlike HF, keeps all question-answer pairs
# even if there are multiple types of answers for the same question.
"""DROP dataset."""


import json
import os

import datasets


# BibTeX entry for the DROP paper; surfaced via `datasets.DatasetInfo.citation`.
_CITATION = """\
@misc{dua2019drop,
    title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs},
    author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},
    year={2019},
    eprint={1903.00161},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
"""

# Dataset summary surfaced via `datasets.DatasetInfo.description`.
_DESCRIPTION = """\
DROP is a QA dataset which tests comprehensive understanding of paragraphs. In
this crowdsourced, adversarially-created, 96k question-answering benchmark, a
system must resolve multiple references in a question, map them onto a paragraph,
and perform discrete operations over them (such as addition, counting, or sorting).
"""

_HOMEPAGE = "https://allenai.org/data/drop"

# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""

# Download location keyed by builder-config name (only the "drop" config exists).
_URLS = {
    "drop": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip",
}

# Placeholder used when a QA pair carries no "validated_answers" entry, so the
# "validated_answers" Sequence feature is still populated (see `_generate_examples`).
_EMPTY_VALIDATED_ANSWER = [
    {
        "number": "",
        "date": {
            "day": "",
            "month": "",
            "year": "",
        },
        "spans": [],
        "worker_id": "",
        "hit_id": "",
    }
]
class Drop(datasets.GeneratorBasedBuilder):
    """DROP is a QA dataset which tests comprehensive understanding of paragraphs.

    Unlike the canonical HF ``drop`` loader, this builder keeps every
    question-answer pair, even when one question has answers of several types.
    """

    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="drop", version=VERSION, description="The DROP dataset."
        ),
    ]

    @staticmethod
    def _answer_features():
        """Return a fresh feature schema for one answer record.

        Shared by the top-level "answer" feature and each element of the
        "validated_answers" sequence; built fresh per call so the two feature
        trees do not share mutable dicts.
        """
        return {
            "number": datasets.Value("string"),
            "date": {
                "day": datasets.Value("string"),
                "month": datasets.Value("string"),
                "year": datasets.Value("string"),
            },
            "spans": datasets.features.Sequence(datasets.Value("string")),
            "worker_id": datasets.Value("string"),
            "hit_id": datasets.Value("string"),
        }

    def _info(self):
        """Build the `datasets.DatasetInfo` (feature schema, citation, homepage)."""
        features = datasets.Features(
            {
                "section_id": datasets.Value("string"),
                "passage": datasets.Value("string"),
                "question": datasets.Value("string"),
                "query_id": datasets.Value("string"),
                "answer": self._answer_features(),
                "validated_answers": datasets.features.Sequence(
                    self._answer_features()
                ),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download/extract the DROP zip and declare the train/validation splits."""
        urls = _URLS[self.config.name]
        data_dir = dl_manager.download_and_extract(urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(
                        data_dir, "drop_dataset", "drop_dataset_train.json"
                    ),
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(
                        data_dir, "drop_dataset", "drop_dataset_dev.json"
                    ),
                    "split": "validation",
                },
            ),
        ]

    @staticmethod
    def _format_answer(raw):
        """Normalize one raw answer dict to the fixed answer schema.

        Optional string fields default to "" when absent; "spans" defaults to
        an empty list. FIX: the previous code defaulted validated-answer
        "spans" to "" (a str), which does not match the Sequence-of-strings
        feature type declared in `_answer_features`.
        """
        date = raw.get("date", {})
        return {
            "number": raw.get("number", ""),
            "date": {
                "day": date.get("day", ""),
                "month": date.get("month", ""),
                "year": date.get("year", ""),
            },
            "spans": raw.get("spans", []),
            "worker_id": raw.get("worker_id", ""),
            "hit_id": raw.get("hit_id", ""),
        }

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, filepath, split):
        """Yield (key, example) pairs, one per question-answer pair.

        Each passage in the raw JSON owns several QA pairs; every pair becomes
        its own example so that no answer type is dropped.
        """
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
            key = 0
            for section_id, example in data.items():
                # Each example (passage) has multiple sub-question-answer pairs.
                for qa in example["qa_pairs"]:
                    answer = self._format_answer(qa["answer"])
                    if "validated_answers" in qa:
                        validated_answers = [
                            self._format_answer(va)
                            for va in qa["validated_answers"]
                        ]
                    else:
                        # Keep the Sequence feature populated even when the raw
                        # record has no validations (common in the train split).
                        validated_answers = _EMPTY_VALIDATED_ANSWER
                    yield key, {
                        "section_id": section_id,
                        "passage": example["passage"],
                        "question": qa["question"],
                        "query_id": qa["query_id"],
                        "answer": answer,
                        "validated_answers": validated_answers,
                    }
                    key += 1
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/__init__.py
ADDED
|
File without changes
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/__pycache__/hendrycks_math.cpython-310.pyc
ADDED
|
Binary file (2.91 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/dataset_infos.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"algebra": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "algebra", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 955021, "num_examples": 1744, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 648291, "num_examples": 1187, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1603312, "size_in_bytes": 21931248}, "counting_and_probability": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "counting_and_probability", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 667385, "num_examples": 771, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 353803, "num_examples": 474, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1021188, "size_in_bytes": 21349124}, "geometry": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "geometry", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1077241, "num_examples": 870, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 523126, "num_examples": 479, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1600367, "size_in_bytes": 21928303}, "intermediate_algebra": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "intermediate_algebra", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1157476, "num_examples": 1295, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 795070, "num_examples": 903, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1952546, "size_in_bytes": 22280482}, "number_theory": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "number_theory", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 595793, "num_examples": 869, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 349455, "num_examples": 540, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 945248, "size_in_bytes": 21273184}, "prealgebra": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "prealgebra", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 715611, "num_examples": 1205, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 510195, "num_examples": 871, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1225806, "size_in_bytes": 21553742}, "precalculus": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "precalculus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 816245, "num_examples": 746, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 552893, "num_examples": 546, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1369138, "size_in_bytes": 21697074}}
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/hendrycks_math.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MATH dataset."""


import json
import os
import pathlib

import datasets


# BibTeX entry for the MATH paper; surfaced via `datasets.DatasetInfo.citation`.
_CITATION = """\
@article{hendrycksmath2021,
    title={Measuring Mathematical Problem Solving With the Math Dataset},
    author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
    journal={NeurIPS},
    year={2021}
}
"""

# Dataset summary surfaced via `datasets.DatasetInfo.description`.
_DESCRIPTION = """\
MATH is a dataset of 12,500 challenging competition mathematics problems. Each
problem in Math has a full step-by-step solution which can be used to teach
models to generate answer derivations and explanations.
"""

_HOMEPAGE = "https://github.com/hendrycks/math"

# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""

# Single tarball shared by all subject configurations.
_URLS = "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar"

# Subject areas; one BuilderConfig is created per name (see BUILDER_CONFIGS).
_NAMES = [
    "algebra",
    "counting_and_probability",
    "geometry",
    "intermediate_algebra",
    "number_theory",
    "prealgebra",
    "precalculus",
]
| 57 |
+
class HendrycksMath(datasets.GeneratorBasedBuilder):
    """Builder for MATH, a dataset of 12,500 competition mathematics problems.

    One config per subject area (see ``_NAMES``); every example carries the
    problem statement, its difficulty level, subject type, and a full
    step-by-step solution.
    """

    VERSION = datasets.Version("0.0.1")

    # One BuilderConfig per subject, all sharing the same version.
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name=subject, version=subject_version, description=subject)
        for subject, subject_version in zip(_NAMES, [VERSION] * len(_NAMES))
    ]

    def _info(self):
        # All four fields are plain strings (problem/solution hold LaTeX text).
        features = datasets.Features(
            {
                field: datasets.Value("string")
                for field in ("problem", "level", "type", "solution")
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        data_dir = dl_manager.download_and_extract(_URLS)
        # The tarball extracts to MATH/{train,test}/<subject>/<problem>.json
        split_layout = [
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.TEST, "test"),
        ]
        return [
            datasets.SplitGenerator(
                name=split_name,
                # These kwargs are forwarded to _generate_examples.
                gen_kwargs={
                    "basepath": os.path.join(data_dir, "MATH", subdir, self.config.name),
                    "split": subdir,
                },
            )
            for split_name, subdir in split_layout
        ]

    # Parameters are unpacked from `gen_kwargs` as given in `_split_generators`.
    def _generate_examples(self, basepath, split):
        """Yield (key, example) pairs from the per-problem JSON files.

        Files are visited in sorted order so keys are deterministic.
        """
        for key, json_path in enumerate(sorted(pathlib.Path(basepath).iterdir())):
            with open(json_path, "r", encoding="utf-8") as f:
                record = json.load(f)
            yield key, {
                "problem": record["problem"],
                "level": record["level"],
                "type": record["type"],
                "solution": record["solution"],
            }
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/decontamination/__init__.py
ADDED
|
File without changes
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/decontamination/archiver.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import zstandard
|
| 3 |
+
import json
|
| 4 |
+
import jsonlines
|
| 5 |
+
import io
|
| 6 |
+
import datetime
|
| 7 |
+
import mmap
|
| 8 |
+
import tqdm
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def json_serial(obj):
    """Fallback serializer for ``json.dumps(default=...)``.

    Renders datetimes as ISO-8601 strings; any other type raises TypeError,
    matching the contract json expects from a ``default`` hook.
    """
    if not isinstance(obj, datetime.datetime):
        raise TypeError("Type %s not serializable" % type(obj))
    return obj.isoformat()
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Modified version of lm_dataformat Archive for single file.
|
| 21 |
+
# Modified version of lm_dataformat Archive for single file.
class Archive:
    """Write {"text": ..., "meta": ...} records to a zstd-compressed jsonl file."""

    def __init__(self, file_path, compression_level=3):
        self.file_path = file_path
        dir_name = os.path.dirname(file_path)
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)
        self.fh = open(self.file_path, "wb")
        self.cctx = zstandard.ZstdCompressor(level=compression_level)
        self.compressor = self.cctx.stream_writer(self.fh)

    def add_data(self, data, meta=None):
        """Append one record; `meta` defaults to an empty dict.

        Fixed: `meta` previously defaulted to a shared mutable dict (`{}`),
        which callers could accidentally mutate across calls.
        """
        if meta is None:
            meta = {}
        self.compressor.write(
            json.dumps({"text": data, "meta": meta}, default=json_serial).encode(
                "UTF-8"
            )
            + b"\n"
        )

    def commit(self):
        """Flush the final zstd frame and close the underlying file."""
        self.compressor.flush(zstandard.FLUSH_FRAME)
        self.fh.flush()
        self.fh.close()
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# Modified version of lm_dataformat Reader with self.fh set, allowing peeking for tqdm.
|
| 46 |
+
# Modified version of lm_dataformat Reader with self.fh set, allowing peeking for tqdm.
class Reader:
    def __init__(self):
        pass

    def read(self, file, get_meta=False, autojoin_paragraphs=True, para_joiner="\n\n"):
        """Stream documents from a jsonl.zst archive.

        Yields document text, or (text, meta) pairs when `get_meta` is True.
        List-valued texts are joined into one string with `para_joiner`
        unless `autojoin_paragraphs` is disabled.
        """
        with open(file, "rb") as fh:
            self.fh = fh  # exposed so callers can poll the file position
            decompressor = zstandard.ZstdDecompressor()
            stream = io.BufferedReader(decompressor.stream_reader(fh))
            for record in jsonlines.Reader(stream):
                # Legacy format: each record is the bare string itself, no meta.
                if isinstance(record, str):
                    assert not get_meta
                    yield record
                    continue

                text = record["text"]
                if autojoin_paragraphs and isinstance(text, list):
                    text = para_joiner.join(text)

                if get_meta:
                    yield text, record.get("meta", {})
                else:
                    yield text
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class TextArchive:
    """Append newline-terminated UTF-8 lines to a plain text file."""

    def __init__(self, file_path, mode="rb+"):
        self.file_path = file_path
        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        # The default "rb+" mode requires an existing file, so create it first.
        if not os.path.exists(file_path):
            Path(file_path).touch()

        self.fh = open(self.file_path, mode)

    def add_data(self, data):
        """Write `data` (a str) as one UTF-8 encoded line."""
        self.fh.write(data.encode("UTF-8") + b"\n")

    def commit(self):
        """Flush pending writes and close the file."""
        self.fh.flush()
        self.fh.close()
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
class TextReader:
    """Line-oriented readers over a plain UTF-8 text file.

    The mmap-backed readers are optimized for throughput (tested up to
    ~250MB/s); ``read_slow`` is a plain readline fallback. All readers yield
    lines with the final character (the newline) stripped.
    """

    def __init__(self, file_path):
        self.file_path = file_path

    # Optimized mmap read with infrequent tqdm updates to maintain speed
    # Tested up to 250MB/s.
    def read_tqdm(self, update_frequency=10000):
        """Yield lines while updating a byte-based tqdm bar.

        The bar is advanced once every `update_frequency` lines because a
        per-line update would dominate runtime at these speeds.
        """
        current_file_position = 0
        line_counter = 0
        # Fixed: explicit utf8 encoding for consistency with the other
        # readers (previously relied on the platform default encoding).
        with open(self.file_path, "r", encoding="utf8") as fh, tqdm.tqdm(
            total=os.path.getsize(self.file_path),
            dynamic_ncols=True,
            unit="byte",
            unit_scale=1,
        ) as progress:
            with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                for line in iter(mmap_obj.readline, b""):
                    line = line.decode("utf-8")
                    line_counter += 1
                    if line_counter == update_frequency:
                        new_file_pos = mmap_obj.tell()
                        bytes_read = new_file_pos - current_file_position
                        current_file_position = new_file_pos
                        progress.update(bytes_read)
                        line_counter = 0
                    yield line[:-1]

    def read_and_tell(self):
        """Yield (line, raw_bytes_read) pairs.

        raw_bytes_read is the number of file bytes consumed for that line,
        newline included — useful for tracking progress externally.
        """
        current_file_position = 0
        with open(self.file_path, "r", encoding="utf8") as fh:
            with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                for line in iter(mmap_obj.readline, b""):
                    line = line.decode("utf-8")
                    new_file_pos = mmap_obj.tell()
                    raw_bytes_read = new_file_pos - current_file_position
                    current_file_position = new_file_pos
                    yield line[:-1], raw_bytes_read

    def read(self):
        """Yield lines via mmap with no progress tracking."""
        with open(self.file_path, "r", encoding="utf8") as fh:
            with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                for line in iter(mmap_obj.readline, b""):
                    yield line.decode("utf-8")[:-1]

    def read_slow(self):
        """Plain readline loop; no mmap."""
        with open(self.file_path, "r", encoding="utf8") as fh:
            while True:
                line = fh.readline()
                # Fixed: removed the impossible `line == -1` comparison —
                # readline returns "" at EOF, never -1.
                if line == "":
                    break
                yield line[:-1]
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
# Optimized for speed. Decompresses the archive in shell before
|
| 150 |
+
# using the mmap'd TextReader.
|
| 151 |
+
# Optimized for speed. Decompresses the archive in shell before
# using the mmap'd TextReader.
class ZStdTextReader:
    def __init__(self, file):
        self.file = file

    def read_tqdm(self):
        """Decompress `self.file` (a .zst path) with the external zstd
        binary, stream its lines via TextReader, then delete the
        decompressed copy."""
        # Dropping the 4-char ".zst" suffix gives zstd's output filename.
        decompressed_file = self.file[:-4]
        print("Decompressing file, please wait...")
        # SECURITY NOTE: the path is interpolated into a shell command
        # unquoted — only use with trusted file paths.
        os.system(f"zstd -d {self.file}")  # linux decompress is faster
        reader = TextReader(decompressed_file)
        yield from reader.read_tqdm()
        os.remove(decompressed_file)
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/decontamination/decontaminate.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import random
|
| 3 |
+
import pickle
|
| 4 |
+
import json
|
| 5 |
+
import glob
|
| 6 |
+
import os
|
| 7 |
+
import collections
|
| 8 |
+
|
| 9 |
+
from .janitor import Janitor, word_ngrams
|
| 10 |
+
from .archiver import ZStdTextReader
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Was used for testing the evaluator decoupled from the full logic below
|
| 14 |
+
# Was used for testing the evaluator decoupled from the full logic below
def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
    """Stub: pretend 10% of `docs` overlap the training set and return that
    many randomly chosen document indices (ngrams args are ignored)."""
    simulated_overlap_rate = 0.1
    n_contaminated = int(len(docs) * simulated_overlap_rate)
    return random.sample(range(len(docs)), n_contaminated)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Returns a dictionary containing all overlapping documents in each
|
| 21 |
+
# task. In the standard use case, an overlap occurs when any of the 13-grams
|
| 22 |
+
# found in the task document exist in the training set documents.
|
| 23 |
+
#
|
| 24 |
+
# To generate 13-grams for the pile see scripts/clean_training_data. The final output of these
|
| 25 |
+
# scripts are an info.json file containing the n_gram_size (13) and a bunch of "ngrams_{x}.bkt.txt.sorted.zst"
|
| 26 |
+
# files. These should exist in the "ngrams_path" provided to this function.
|
| 27 |
+
|
| 28 |
+
# Algorithm:
|
| 29 |
+
# 1. Build lookups for each dataset {ngram: list(document_ids)}
|
| 30 |
+
# 2. Merge into an overall lookup {ngram: [(task_name, task_set, doc_ids),]}
|
| 31 |
+
# 3. Full scan the 13-grams from the training set against the merged lookup,
|
| 32 |
+
# saving matches in the "duplicates" dictionary {(task_name, task_set): set(doc_ids)}
|
| 33 |
+
# 4. Strip the task_set from the dictionary keys and return
|
| 34 |
+
#
|
| 35 |
+
# We cache the task+set lookups as well as the overlaps.
|
| 36 |
+
def get_train_overlap(docs_by_task_set, ngrams_path, limit):
    """Return {task_name: set(doc_ids)} of task documents contaminated by
    the training set.

    An overlap occurs when any ngram (of the size recorded in
    `ngrams_path/info.json`, normally 13) found in a task document also
    exists in the training-set ngram files ("*.sorted.zst") under
    `ngrams_path`. To generate 13-grams for the pile see
    scripts/clean_training_data.

    Algorithm:
    1. Build lookups for each dataset {ngram: set(document_ids)}
    2. Merge into an overall lookup {ngram: [(task_name, task_set, doc_ids),]}
    3. Full scan the training-set ngrams against the merged lookup, saving
       matches in `duplicates` {(task_name, task_set): set(doc_ids)}
    4. Strip the task_set from the dictionary keys and return

    Both the per-task lookups and the final overlaps are cached under data/.
    """
    # return get_train_overlap_stub(docs, ngrams_path, ngrams_n_size)

    info_dict_path = os.path.join(ngrams_path, "info.json")
    # Fixed: use context managers for every file access below — the original
    # passed bare open() results to json/pickle, leaking file handles.
    with open(info_dict_path, "r") as info_fp:
        info_dict = json.load(info_fp)
    ngrams_n_size = info_dict["ngram_size"]

    janitor = Janitor()

    # Build lookup for each dataset first in case we use different task combinations later
    print("Building Lookups...")
    start = time.perf_counter()

    def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit):
        return f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.overlaps"

    lookups = {}
    duplicates = {}  # {(task_name, task_set): set(doc_ids)}
    sets_to_decontaminate = len(docs_by_task_set.keys())

    for (task_name, task_set), docs in docs_by_task_set.items():
        # Fixed: makedirs also creates the top-level "data/" directory if it
        # is missing (os.mkdir would raise FileNotFoundError).
        os.makedirs(f"data/{task_name}", exist_ok=True)

        # Check if we've decontaminated this combination before
        overlaps_dump_path = get_overlaps_dump_path(
            task_name, task_set, ngrams_n_size, limit
        )
        if os.path.exists(overlaps_dump_path):
            with open(overlaps_dump_path, "rb") as fp:
                duplicates[(task_name, task_set)] = pickle.load(fp)
            sets_to_decontaminate -= 1
            continue
        else:
            duplicates[(task_name, task_set)] = set()

        # Build/load the task lookup {ngram: set(documents)}.
        task_set_lookup_path = (
            f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.lookup"
        )
        if os.path.exists(task_set_lookup_path):
            print(f"{task_set_lookup_path} available, loading...")
            with open(task_set_lookup_path, "rb") as fp:
                lookups[(task_name, task_set)] = pickle.load(fp)
        else:
            print(f"{task_set_lookup_path} not available, building...")
            lookup = collections.defaultdict(set)

            for doc_id, document in enumerate(docs):
                ngrams = word_ngrams(janitor.normalize_string(document), ngrams_n_size)
                for ngram in ngrams:
                    lookup[ngram].add(doc_id)

            with open(task_set_lookup_path, "wb") as fp:
                pickle.dump(lookup, fp)
            lookups[(task_name, task_set)] = lookup

    elapsed = time.perf_counter() - start
    print(f"Building lookups took {elapsed:0.5f} seconds.")

    matched_ngrams = []

    if sets_to_decontaminate > 0:
        print("Merging lookups...")
        start = time.perf_counter()
        merged_lookup = collections.defaultdict(list)
        for (task_name, task_set), lookup in lookups.items():
            for ngram, doc_ids in lookup.items():
                merged_lookup[ngram].append((task_name, task_set, doc_ids))

        elapsed = time.perf_counter() - start
        print(f"Merging lookups took {elapsed:0.5f} seconds.")

        print(f"{ngrams_n_size} grams files found in {ngrams_path}:")
        # Fixed: plain string literal (the original f-string had no placeholders).
        files = glob.glob(os.path.join(ngrams_path, "*.sorted.zst"))
        print(files)

        for file in files:
            start = time.perf_counter()
            print(f"Scanning {file}")
            reader = ZStdTextReader(file)
            total_ngrams = 0
            unique_ngrams = 0
            matching_unique = 0
            non_matching_unique = 0

            current_ngram = ""
            for line in reader.read_tqdm():  # Scan training set ngrams file
                total_ngrams += 1
                [ngram, document_id] = line.rsplit(" ", 1)
                # The file is sorted, so each distinct ngram only needs to be
                # matched once against the lookup.
                if ngram != current_ngram:
                    unique_ngrams += 1
                    current_ngram = ngram
                    if ngram in merged_lookup:
                        matched_ngrams.append(ngram)  # For logging
                        matching_unique += 1
                        # Record contamination across all relevant task/set combos
                        for task_name, task_set, doc_ids in merged_lookup[ngram]:
                            task_doc_set = duplicates[(task_name, task_set)]
                            for doc_id in doc_ids:
                                task_doc_set.add(doc_id)
                        del merged_lookup[ngram]  # No point matching again
                    else:
                        non_matching_unique += 1

            print(f"Total Ngrams: {total_ngrams}")
            print(f"Unique Ngrams: {unique_ngrams}")
            print(f"Unique Matching: {matching_unique}")
            print(f"Unique Non Matching: {non_matching_unique}")
            print("Matched ngrams:")
            for ngram in matched_ngrams:
                print(ngram)

            elapsed = time.perf_counter() - start
            print(f"Read took {elapsed:0.5f} seconds.")
            print(f"Speed: {(os.path.getsize(file)/1000000.0)/elapsed}MB/second")

        print(duplicates)

        # Dump overlaps separately
        for (task_name, task_set), doc_ids in duplicates.items():
            overlaps_dump_path = get_overlaps_dump_path(
                task_name, task_set, ngrams_n_size, limit
            )
            with open(overlaps_dump_path, "wb") as fp:
                pickle.dump(doc_ids, fp)

    # Strip task set and return
    return {task_name: doc_ids for (task_name, task_set), doc_ids in duplicates.items()}
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/decontamination/janitor.py
ADDED
|
@@ -0,0 +1,325 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import string
|
| 3 |
+
import timeit
|
| 4 |
+
import pickle
|
| 5 |
+
import traceback
|
| 6 |
+
from pprint import pprint
|
| 7 |
+
|
| 8 |
+
# This is a cpp module. Compile janitor_util.cpp with:
|
| 9 |
+
# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
|
| 10 |
+
try:
|
| 11 |
+
import janitor_util
|
| 12 |
+
|
| 13 |
+
JANITOR_CPP = True
|
| 14 |
+
except Exception:
|
| 15 |
+
print("WARNING: C++ module could not be loaded. Janitor running in python mode")
|
| 16 |
+
traceback.print_exc()
|
| 17 |
+
JANITOR_CPP = False
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Implementation from nltk source
|
| 21 |
+
# https://www.nltk.org/_modules/nltk/util.html
|
| 22 |
+
# Implementation from nltk source
# https://www.nltk.org/_modules/nltk/util.html
def form_ngrams(sequence, n):
    """Yield successive n-tuples from the iterator `sequence`."""
    window = []
    # Pre-fill the first n-1 items; if the input runs out early, just end
    # (PEP 479: StopIteration must not escape a generator).
    for _ in range(n - 1):
        try:
            window.append(next(sequence))
        except StopIteration:
            return
    for item in sequence:
        window.append(item)
        yield tuple(window)
        del window[0]


def word_ngrams(s, n):
    """Split `s` on whitespace and yield its word n-grams as strings."""
    words = s.split()  # not a generator :(
    return (" ".join(gram) for gram in form_ngrams(iter(words), n))


# https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
def split_indices(s):
    """Split `s` on whitespace, yielding (word, (start_idx, end_idx)) pairs.

    End indices are inclusive positions into the original string.
    """
    for match in re.finditer(r"\S+", s):
        yield match.group(0), (match.start(), match.end() - 1)


def word_ngrams_indices(s, n):
    """Yield (ngram_string, (start_idx, end_idx)) pairs for word n-grams of `s`.

    Each span covers the original-string range from the first word's start
    to the last word's (inclusive) end.
    """
    tokens_with_indices = split_indices(s)

    # n-grams over (word, (start, end)) pairs:
    # ([(word, (start, end)), (word, (start, end)), ...], ...)
    ngram_seqs_with_indices = form_ngrams(tokens_with_indices, n)

    for gram_with_indices in ngram_seqs_with_indices:
        # Transpose into parallel word / span sequences.
        words, spans = zip(*gram_with_indices)
        yield " ".join(words), (spans[0][0], spans[-1][1])
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
class Janitor:
    """Removes registered contaminant ngrams (e.g. test-set text) from
    training documents, splitting each document into clean chunks.

    Uses the compiled janitor_util C++ module when available (JANITOR_CPP),
    falling back to a pure-python implementation otherwise.
    """

    # FIXME delete_chars: Should anything else go here? Special chars?
    def __init__(
        self,
        ngram_n=13,
        window_to_remove=200,
        too_dirty_cutoff=10,
        minimum_slice_length=200,
        delete_chars=string.punctuation,
    ):
        # ngram_n: ngram size used to detect contamination
        # window_to_remove: chars stripped on each side of a dirty ngram
        # too_dirty_cutoff: max dirty ngrams before the whole doc is dropped
        # minimum_slice_length: clean chunks shorter than this are discarded
        self.ngram_n = ngram_n
        self.window_to_remove = window_to_remove
        self.too_dirty_cutoff = too_dirty_cutoff
        self.minimum_slice_length = minimum_slice_length
        self.delete_chars = delete_chars

        self.dirt_ngrams = set()

        # If in python, we'll translate uppercase to lowercase and delete naughty characters.
        # This is fast by python standards
        # https://stackoverflow.com/questions/638893/what-is-the-most-efficient-way-in-python-to-convert-a-string-to-all-lowercase-st
        self.translation_table = str.maketrans(
            string.ascii_lowercase + string.ascii_uppercase,  # These characters
            string.ascii_lowercase * 2,  # Become these characters
            self.delete_chars,  # These are deleted
        )

    ##############
    # I/O for saving contamination ngrams
    ##############

    def save_contamination_ngrams(self, filename):
        """Pickle the registered contamination ngram set to `filename`.

        Fixed: previously pickled the `filename` string itself instead of
        `self.dirt_ngrams`, making saved files useless for later loading.
        """
        with open(filename, "wb") as fp:
            pickle.dump(self.dirt_ngrams, fp)

    def load_contamination_ngrams(self, filename):
        """Load a previously saved contamination ngram set from `filename`."""
        with open(filename, "rb") as fp:
            self.dirt_ngrams = pickle.load(fp)

    ##############
    # Call these :)
    ##############

    def register_contaminant(self, dirt_string):
        """Register a string as contamination to be removed, e.g. a test set
        This breaks the dirt_string into ngrams to store for future cleaning"""
        if JANITOR_CPP:
            return self.register_contaminant_cpp(dirt_string)
        else:
            print("WARNING: Janitor running in python mode")
            return self.register_contaminant_python(dirt_string)

    def clean(self, dirty_string):
        """Clean a string (e.g. a training set) by removing all ngrams previously
        registered as contaminants. Returns a list of clean chunks, or empty if
        the string was too dirty"""
        if JANITOR_CPP:
            return self.clean_cpp(dirty_string)
        else:
            print("WARNING: Janitor running in python mode")
            return self.clean_python(dirty_string)

    def _split_chunks(self, dirty_string, dirty_parts):
        """Split `dirty_string` into clean chunks around each dirty span.

        `dirty_parts` is an iterable of (ngram, start, end) triples. A
        `window_to_remove`-char margin is stripped on both sides of each
        span; chunks shorter than `minimum_slice_length` are discarded, and
        the whole document is dropped (returns []) once `too_dirty_cutoff`
        dirty ngrams are seen.
        """
        clean_chunks = []
        splice_idx = 0
        end = -1
        for i, (ngram, start, end) in enumerate(dirty_parts):
            if i >= self.too_dirty_cutoff:
                return []
            start = max(0, start - self.window_to_remove)
            end = min(len(dirty_string), end + self.window_to_remove)

            if start - splice_idx > self.minimum_slice_length:
                clean_chunks.append(dirty_string[splice_idx:start])
            splice_idx = end

        # Trailing clean region after the last dirty span.
        if end < len(dirty_string) - self.minimum_slice_length:
            clean_chunks.append(dirty_string[end + 1 :])

        return clean_chunks

    ##############
    # Fast C++
    ##############

    def register_contaminant_cpp(self, dirt_string):
        self.dirt_ngrams.update(
            janitor_util.clean_ngram(dirt_string, self.delete_chars, self.ngram_n)
        )

    def clean_cpp(self, dirty_string):
        contamination_indices = janitor_util.clean_ngram_with_indices(
            dirty_string, self.delete_chars, self.ngram_n
        )
        return self._split_chunks(dirty_string, contamination_indices)

    ##############
    # Slow python
    ##############

    def normalize_string(self, s):
        """Lowercase `s` and strip `delete_chars` (punctuation by default)."""
        return s.translate(self.translation_table)

    def register_contaminant_python(self, dirt_string):
        self.dirt_ngrams.update(
            word_ngrams(self.normalize_string(dirt_string), self.ngram_n)
        )

    def clean_python(self, dirty_string):
        # (None, start, end) triples: _split_chunks ignores the ngram field.
        contamination_indices = (
            (None, *idx_pair)
            for dirty_ngram, idx_pair in word_ngrams_indices(dirty_string, self.ngram_n)
            if self.normalize_string(dirty_ngram) in self.dirt_ngrams
        )
        return self._split_chunks(dirty_string, contamination_indices)
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
##################################################################
|
| 225 |
+
# Tests
|
| 226 |
+
#################################################################
|
| 227 |
+
|
| 228 |
+
# def print_cpp():
|
| 229 |
+
# source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
|
| 230 |
+
|
| 231 |
+
# for i in range(1, 10, 2):
|
| 232 |
+
# pprint(janitor_util.clean_ngram(source, string.punctuation, i))
|
| 233 |
+
# for ngram, start, end in \
|
| 234 |
+
# janitor_util.clean_ngram_with_indices(source, string.punctuation, i):
|
| 235 |
+
# print(ngram, "\t", start, end, source[start:end].replace("\n", "\\n"))
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
# def test_cpp():
|
| 239 |
+
# source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
|
| 240 |
+
# contaminant = "dirty boy. Clean he he"
|
| 241 |
+
|
| 242 |
+
# jan_python = Janitor()
|
| 243 |
+
# jan_cpp = Janitor()
|
| 244 |
+
|
| 245 |
+
# jan_python.register_contaminant_python(contaminant)
|
| 246 |
+
# jan_cpp.register_contaminant(contaminant)
|
| 247 |
+
|
| 248 |
+
# assert jan_python.dirt_ngrams == jan_cpp.dirt_ngrams, (jan_python.dirt_ngrams, jan_cpp.dirt_ngrams)
|
| 249 |
+
|
| 250 |
+
# assert jan_python.clean_python(source) == jan_cpp.clean(source), \
|
| 251 |
+
# (jan_python.clean_python(source), jan_cpp.clean(source))
|
| 252 |
+
|
| 253 |
+
# print("Passed test, python==cpp")
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
# def benchmark():
|
| 257 |
+
# # Download and put in data folder: enwik8 (100 MB) from https://cs.fit.edu/~mmahoney/compression/textdata.html
|
| 258 |
+
# setup = \
|
| 259 |
+
# """
|
| 260 |
+
# with open("data/enwik8", "r") as f:
|
| 261 |
+
# data = f.read()
|
| 262 |
+
# jan = Janitor(too_dirty_cutoff=1000)
|
| 263 |
+
# jan.register_contaminant('''
|
| 264 |
+
# theories is that there is a connection between "geekdom" and autism.
|
| 265 |
+
# This is hinted, for instance, by a ''Wired Magazine'' article in 2001 entitled "
|
| 266 |
+
# The [[Geek]] Syndrome", which is a point argued by many in the autism rights
|
| 267 |
+
# movement{{ref|Wired}}. This article, many professionals assert, is just one example of
|
| 268 |
+
# the media's application of mental disease labels to what is actually variant normal behavior
|
| 269 |
+
# &mdash;they argue that shyness, lack of athletic ability or social skills, and intellectual
|
| 270 |
+
# interests, even when they seem unusual to others, are not in themselves signs of autism or
|
| 271 |
+
# Asperger's syndrome. Others assert that it is actually the medical profession which is applying
|
| 272 |
+
# mental disease labels to children who in the past would have simply been accepted as a little
|
| 273 |
+
# different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue.
|
| 274 |
+
# Due to the recent publicity surrounding autism and autis
|
| 275 |
+
# ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first,
|
| 276 |
+
# oil money had a marginal impact. A few lowrise concete buildings were erected, and the first
|
| 277 |
+
# paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
|
| 278 |
+
# would last, took a cautious approach, preferring to save the revenue rather than investing it in
|
| 279 |
+
# development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
|
| 280 |
+
# to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his
|
| 281 |
+
# brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]],
|
| 282 |
+
# with the assistance of the British, Sheikh Zayed became the new ruler. See generally, Al-Fahim, M,
|
| 283 |
+
# ''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995),
|
| 284 |
+
# ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the
|
| 285 |
+
# Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the
|
| 286 |
+
# [[United Arab Emirates]]. After the Emirates gained independence in 1971,
|
| 287 |
+
# ''')
|
| 288 |
+
# """
|
| 289 |
+
|
| 290 |
+
# n = 1
|
| 291 |
+
# print(f"Timing {n} run on 100 MB")
|
| 292 |
+
# print("Register contaminant")
|
| 293 |
+
# # print("\tPython", timeit.timeit("jan.register_contaminant_python(data)", setup=setup, globals=globals(), number=n))
|
| 294 |
+
# print("\tCpp", timeit.timeit("jan.register_contaminant(data)", setup=setup, globals=globals(), number=n))
|
| 295 |
+
|
| 296 |
+
# print("Clean")
|
| 297 |
+
# # print("\tPython", timeit.timeit("jan.clean_python(data)", setup=setup, globals=globals(), number=n))
|
| 298 |
+
# print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n))
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
# def test_janitor_general():
|
| 302 |
+
# source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
|
| 303 |
+
# contaminant = "dirty boy. Clean he he"
|
| 304 |
+
|
| 305 |
+
# jan = Janitor(ngram_n=3)
|
| 306 |
+
# jan.register_contaminant(contaminant)
|
| 307 |
+
# cleaned = " ".join(jan.clean(source))
|
| 308 |
+
# for contam in jan.dirt_ngrams:
|
| 309 |
+
# assert contam not in cleaned, contam
|
| 310 |
+
|
| 311 |
+
# filename = "data/saved_contam"
|
| 312 |
+
# jan.save_contamination_ngrams(filename)
|
| 313 |
+
|
| 314 |
+
# jan = Janitor(ngram_n=3)
|
| 315 |
+
# jan.load_contamination_ngrams(filename)
|
| 316 |
+
# cleaned = " ".join(jan.clean(source))
|
| 317 |
+
# for contam in jan.dirt_ngrams:
|
| 318 |
+
# assert contam not in cleaned, contam
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
# if __name__ == "__main__":
|
| 322 |
+
# test()
|
| 323 |
+
# # print_cpp()
|
| 324 |
+
# # test_cpp()
|
| 325 |
+
# # benchmark()
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from . import gpt2
|
| 2 |
+
from . import gpt3
|
| 3 |
+
from . import huggingface
|
| 4 |
+
from . import textsynth
|
| 5 |
+
from . import dummy
|
| 6 |
+
|
| 7 |
+
# Maps the model-type names accepted on the command line to their LM
# implementation classes.  "hf" and "hf-causal" are aliases for the legacy
# gpt2-based causal wrapper; the "-experimental" and seq2seq entries use the
# newer AutoModel-based wrappers from `huggingface.py`.
MODEL_REGISTRY = {
    "hf": gpt2.HFLM,
    "hf-causal": gpt2.HFLM,
    "hf-causal-experimental": huggingface.AutoCausalLM,
    "hf-seq2seq": huggingface.AutoSeq2SeqLM,
    "gpt2": gpt2.GPT2LM,
    "gpt3": gpt3.GPT3LM,
    "textsynth": textsynth.TextSynthLM,
    "dummy": dummy.DummyLM,
}
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def get_model(model_name):
    """Look up an LM implementation class by its registry name.

    :param model_name: key in ``MODEL_REGISTRY`` (e.g. "hf-causal", "gpt3")
    :return: the LM class registered under that name
    :raises KeyError: if the name is unknown; the message lists valid names
        instead of the bare missing-key error, to aid CLI users.
    """
    try:
        return MODEL_REGISTRY[model_name]
    except KeyError:
        raise KeyError(
            f"Unknown model {model_name!r}. Available models: "
            + ", ".join(sorted(MODEL_REGISTRY))
        ) from None
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (655 Bytes). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__pycache__/dummy.cpython-310.pyc
ADDED
|
Binary file (1.3 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__pycache__/gpt2.cpython-310.pyc
ADDED
|
Binary file (3.58 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__pycache__/gpt3.cpython-310.pyc
ADDED
|
Binary file (6.22 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__pycache__/huggingface.cpython-310.pyc
ADDED
|
Binary file (22.4 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__pycache__/textsynth.cpython-310.pyc
ADDED
|
Binary file (4.5 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/models/dummy.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
from lm_eval.base import LM
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class DummyLM(LM):
    """A stub LM returning random log-likelihoods and canned generations.

    Useful for exercising the evaluation pipeline without loading a model.
    """

    def __init__(self):
        pass

    @classmethod
    def create_from_arg_string(cls, arg_string, additional_config=None):
        # No configuration is needed; the argument string is ignored.
        return cls()

    def loglikelihood(self, requests):
        # One (logprob, is_greedy) pair per request; logprob is random in (-1, 0].
        return [(-random.random(), False) for _ in requests]

    def greedy_until(self, requests):
        results = []
        for context, _ in requests:
            results.append("lol")
            # Sanity check: every request must carry a non-empty context.
            assert context.strip() != ""
        return results

    def loglikelihood_rolling(self, requests):
        # One random log-likelihood per request.
        return [-random.random() for _ in requests]
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/models/gpt2.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import transformers
|
| 3 |
+
from typing import Optional, Union
|
| 4 |
+
from lm_eval.base import BaseLM
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class HFLM(BaseLM):
    """Causal-LM wrapper around ``transformers.AutoModelForCausalLM``.

    Despite the historical attribute name ``self.gpt2``, any causal model
    loadable by AutoModelForCausalLM can be wrapped.
    """

    def __init__(
        self,
        device="cuda",
        pretrained="gpt2",
        revision="main",
        low_cpu_mem_usage=None,
        torch_dtype=None,
        device_map=None,
        subfolder=None,
        tokenizer=None,
        batch_size=1,
        load_in_8bit: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
        use_fast: Optional[bool] = True,
    ):
        """Load model and tokenizer from the HuggingFace Hub or a local path.

        :param device: "cuda", "cpu", or a CUDA device index as a string
        :param pretrained: model name or path passed to ``from_pretrained``
        :param revision: Hub revision; ``subfolder`` is appended to it below
        :param tokenizer: optional separate tokenizer name/path (defaults to
            ``pretrained``)
        :param batch_size: per-GPU batch size used by the evaluator
        :param load_in_8bit: load via bitsandbytes 8-bit quantization; when
            set, the model is NOT moved with ``.to(device)`` below
        """
        super().__init__()

        assert isinstance(device, str)
        assert isinstance(pretrained, str)
        assert isinstance(batch_size, int)

        if device:
            # A non-"cuda"/"cpu" string is interpreted as a CUDA device index.
            if device not in ["cuda", "cpu"]:
                device = int(device)
            self._device = torch.device(device)
            print(f"Using device '{device}'")
        else:
            # No device given: pick CUDA when available, else CPU.
            print("Device not specified")
            print(f"Cuda Available? {torch.cuda.is_available()}")
            self._device = (
                torch.device("cuda")
                if torch.cuda.is_available()
                else torch.device("cpu")
            )

        # TODO: update this to be less of a hack once subfolder is fixed in HF
        revision = revision + ("/" + subfolder if subfolder is not None else "")

        self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
            pretrained,
            load_in_8bit=load_in_8bit,
            low_cpu_mem_usage=low_cpu_mem_usage,
            torch_dtype=torch_dtype,
            device_map=device_map,
            revision=revision,
            trust_remote_code=trust_remote_code,
        ).eval()
        if not load_in_8bit:
            try:
                self.gpt2.to(self.device)
            except:  # noqa: E722
                # Quantized/device-mapped models may refuse .to(); best-effort.
                print(
                    "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore."
                )
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            pretrained if tokenizer is None else tokenizer,
            revision=revision,
            trust_remote_code=trust_remote_code,
            use_fast=use_fast,
        )
        self.vocab_size = self.tokenizer.vocab_size

        # multithreading and batching
        self.batch_size_per_gpu = batch_size  # todo: adaptive batch size

        # TODO: fix multi-gpu
        # gpus = torch.cuda.device_count()
        # if gpus > 1:
        #     self.gpt2 = nn.DataParallel(self.gpt2)

    @property
    def eot_token_id(self):
        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
        return self.tokenizer.eos_token_id

    @property
    def max_length(self):
        """Maximum context length the model supports."""
        try:
            return self.gpt2.config.n_ctx
        except AttributeError:
            # gptneoconfig doesn't have n_ctx apparently
            return self.gpt2.config.max_position_embeddings

    @property
    def max_gen_toks(self):
        # Cap on the number of tokens generated per request.
        return 256

    @property
    def batch_size(self):
        # TODO: fix multi-gpu
        return self.batch_size_per_gpu  # * gpus

    @property
    def device(self):
        # TODO: fix multi-gpu
        return self._device

    def tok_encode(self, string: str):
        """Encode text to token ids without adding special tokens."""
        return self.tokenizer.encode(string, add_special_tokens=False)

    def tok_decode(self, tokens):
        """Decode token ids back to text."""
        return self.tokenizer.decode(tokens)

    def _model_call(self, inps):
        """
        inps: a torch tensor of shape [batch, sequence]
        the size of sequence may vary from call to call

        returns: a torch tensor of shape [batch, sequence, vocab] with the
        logits returned from the model
        """
        with torch.no_grad():
            return self.gpt2(inps)[0]

    def _model_generate(self, context, max_length, eos_token_id):
        """Greedy generation until max_length or the EOS token."""
        return self.gpt2.generate(
            context,
            max_length=max_length,
            eos_token_id=eos_token_id,
            pad_token_id=eos_token_id,
            do_sample=False,
        )


# for backwards compatibility
GPT2LM = HFLM
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/models/gpt3.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import numpy as np
|
| 3 |
+
import transformers
|
| 4 |
+
from lm_eval.base import BaseLM
|
| 5 |
+
from lm_eval import utils
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
+
import time
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def get_result(response, ctxlen):
    """Process results from OpenAI API response.

    :param response: dict
        A single OpenAI API completion choice, with "logprobs" populated.
    :param ctxlen: int
        Length of context (so we can slice it away and keep only predictions).
    :return:
        continuation_logprobs: float
            Sum of log probabilities of the continuation tokens.
        is_greedy: bool
            Whether the argmax candidate matches the given continuation at
            every position.
    """
    logprob_info = response["logprobs"]
    # Total log-prob over the continuation region only.
    continuation_logprobs = sum(logprob_info["token_logprobs"][ctxlen:])

    # Greedy iff, at every continuation position, the observed token is also
    # the highest-probability candidate reported by the API.
    is_greedy = True
    tokens = logprob_info["tokens"]
    top_logprobs = logprob_info["top_logprobs"]
    for idx in range(ctxlen, len(tokens)):
        candidates = top_logprobs[idx]
        best = max(candidates.keys(), key=candidates.get)
        if best != tokens[idx]:
            is_greedy = False
            break

    return continuation_logprobs, is_greedy
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def oa_completion(**kwargs):
    """Query the OpenAI completion API, retrying forever with back-off.

    Any ``OpenAIError`` is printed and the call is retried after a delay
    that grows by 1.5x per failure (starting at 3 seconds).
    """
    import openai

    delay = 3
    while True:
        try:
            return openai.Completion.create(**kwargs)
        except openai.error.OpenAIError:
            import traceback

            traceback.print_exc()
            time.sleep(delay)
            delay *= 1.5
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class GPT3LM(BaseLM):
    """LM backed by the OpenAI completion API (legacy "engine" endpoints)."""

    # Number of requests batched into a single API call.
    REQ_CHUNK_SIZE = 20

    def __init__(self, engine, truncate=False):
        """

        :param engine: str
            OpenAI API engine (e.g. davinci)
        :param truncate: bool
            Truncate input if too long (if False and input is too long, throw error)
        """
        super().__init__()

        import openai

        self.engine = engine
        # GPT-2 tokenization matches what the OpenAI API uses for these engines.
        self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2")

        self.vocab_size = self.tokenizer.vocab_size

        # to make the annoying "Using pad_token, but it is not set yet." error go away
        self.tokenizer.pad_token = "<|endoftext|>"
        assert self.tokenizer.encode("hello\n\nhello") == [31373, 198, 198, 31373]
        self.truncate = truncate
        self.end_of_text_token_id = self.tokenizer.convert_tokens_to_ids(
            ["<|endoftext|>"]
        )[0]

        # Read from environment variable OPENAI_API_SECRET_KEY
        openai.api_key = os.environ["OPENAI_API_SECRET_KEY"]

    @property
    def eot_token_id(self):
        return self.tokenizer.eos_token_id

    @property
    def max_length(self):
        # Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
        return 2048

    @property
    def max_gen_toks(self):
        # Cap on the number of tokens generated per request.
        return 256

    @property
    def batch_size(self):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    @property
    def device(self):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    def tok_encode(self, string: str):
        # Encode without special tokens, mirroring what the API expects.
        return self.tokenizer.encode(string, add_special_tokens=False)

    def tok_decode(self, tokens):
        return self.tokenizer.decode(tokens)

    def _loglikelihood_tokens(self, requests, disable_tqdm=False):
        """Score (context, continuation) token pairs via the API.

        Requests are reordered longest-first so similar-length prompts are
        batched together, then restored to original order on return.
        """
        res = []

        def _collate(x):
            # this doesn't efficiently handle last-token differences yet, but those are kinda annoying because
            # it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations
            # we care about and so we need some kind of backup for when it isn't
            toks = x[1] + x[2]
            return -len(toks), tuple(toks)

        re_ord = utils.Reorderer(requests, _collate)

        for chunk in tqdm(
            list(utils.chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE)),
            disable=disable_tqdm,
        ):
            inps = []
            ctxlens = []
            for cache_key, context_enc, continuation_enc in chunk:
                # max_length+1 because the API takes up to 2049 tokens, including the first context token
                inp = (context_enc + continuation_enc)[-(self.max_length + 1) :]
                # TODO: the logic is much simpler if we just look at the length of continuation tokens
                ctxlen = len(context_enc) - max(
                    0, len(context_enc) + len(continuation_enc) - (self.max_length + 1)
                )

                inps.append(inp)
                ctxlens.append(ctxlen)

            # echo=True + max_tokens=0 returns logprobs for the prompt itself.
            response = oa_completion(
                engine=self.engine,
                prompt=inps,
                echo=True,
                max_tokens=0,
                temperature=0.0,
                logprobs=10,
            )

            for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip(
                response.choices, ctxlens, chunk
            ):
                answer = get_result(resp, ctxlen)

                res.append(answer)

                # partial caching
                if cache_key is not None:
                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)

        return re_ord.get_original(res)

    def greedy_until(self, requests):
        """Generate greedily for each (context, until) request via the API."""
        if not requests:
            return []
        res = []

        def _collate(x):
            # Sort key: prompt token length, then the prompt text itself.
            toks = self.tok_encode(x[0])
            return len(toks), x[0]

        re_ord = utils.Reorderer(requests, _collate)

        def sameuntil_chunks(xs, size):
            # Group consecutive requests sharing the same `until` stop list,
            # capped at `size` per group, so one API call can serve each group.
            ret = []
            lastuntil = xs[0][1]
            for x in xs:
                if len(ret) >= size or x[1] != lastuntil:
                    yield ret, lastuntil
                    ret = []
                    lastuntil = x[1]
                ret.append(x)

            if ret:
                yield ret, lastuntil

        # todo: more intelligent batching for heterogeneous `until`
        for chunk, until in tqdm(
            list(sameuntil_chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE))
        ):
            inps = []
            for request in chunk:
                context = request[0]
                context_enc = self.tok_encode(context)
                # Leave room for max_gen_toks generated tokens.
                inp = context_enc[-(self.max_length - self.max_gen_toks) :]
                inps.append(inp)

            response = oa_completion(
                engine=self.engine,
                prompt=inps,
                max_tokens=self.max_gen_toks,
                temperature=0.0,
                logprobs=10,
                stop=until,
            )

            for resp, request in zip(response.choices, chunk):
                context = request[0]
                until_ = request[1]
                s = resp["text"]

                # Trim at the first occurrence of any stop sequence.
                for term in until_:
                    s = s.split(term)[0]

                # partial caching
                self.cache_hook.add_partial("greedy_until", (context, until_), s)

                res.append(s)

        return re_ord.get_original(res)

    def _model_call(self, inps):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    def _model_generate(self, context, max_length, eos_token_id):
        # Isn't used because we override greedy_until
        raise NotImplementedError()
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/models/huggingface.py
ADDED
|
@@ -0,0 +1,740 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
import transformers
|
| 5 |
+
import peft
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import List, Mapping, NewType, Optional, Tuple, Union
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
|
| 10 |
+
from transformers import BatchEncoding
|
| 11 |
+
|
| 12 |
+
from lm_eval import utils
|
| 13 |
+
from lm_eval.base import BaseLM
|
| 14 |
+
|
| 15 |
+
TokenSequence = Union[List[int], torch.LongTensor, torch.Tensor, BatchEncoding]
|
| 16 |
+
|
| 17 |
+
_DeviceMapping = NewType("DeviceMapping", Mapping[str, Union[int, str, torch.device]])
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _get_accelerate_args(
|
| 21 |
+
device_map_option: Optional[str] = "auto",
|
| 22 |
+
max_memory_per_gpu: Optional[Union[int, str]] = None,
|
| 23 |
+
max_cpu_memory: Optional[Union[int, str]] = None,
|
| 24 |
+
offload_folder: Optional[str] = "./offload",
|
| 25 |
+
) -> dict:
|
| 26 |
+
"""Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
|
| 27 |
+
max_memory = {}
|
| 28 |
+
if max_memory_per_gpu is not None:
|
| 29 |
+
max_memory_per_gpu_map = {
|
| 30 |
+
device_idx: max_memory_per_gpu
|
| 31 |
+
for device_idx in range(torch.cuda.device_count())
|
| 32 |
+
}
|
| 33 |
+
max_memory.update(max_memory_per_gpu_map)
|
| 34 |
+
if max_cpu_memory is not None:
|
| 35 |
+
max_memory["cpu"] = max_cpu_memory
|
| 36 |
+
|
| 37 |
+
args = {}
|
| 38 |
+
if max_memory:
|
| 39 |
+
args["max_memory"] = max_memory
|
| 40 |
+
args["device_map"] = device_map_option
|
| 41 |
+
args["offload_folder"] = offload_folder
|
| 42 |
+
return args
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _get_dtype(
    dtype: Union[str, torch.dtype], config: Optional[transformers.AutoConfig] = None
) -> torch.dtype:
    """Converts `dtype` from `str` to torch.dtype when possible."""
    if dtype is None and config is not None:
        # No explicit dtype: fall back to the one recorded in the model config.
        return config.torch_dtype
    if isinstance(dtype, str) and dtype != "auto":
        # Map e.g. "float16" -> torch.float16; "auto" passes through unchanged.
        return getattr(torch, dtype)
    return dtype
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class HuggingFaceAutoLM(BaseLM):
    """Shared base for HuggingFace `transformers` evaluation adapters.

    Subclasses (causal vs. seq2seq) set `AUTO_MODEL_CLASS`/`AUTO_PEFT_CLASS`
    and provide `_model_call`/`_model_generate`; this base handles loading
    the config, tokenizer, model (optionally via `accelerate`, GPTQ, or
    PEFT adapters) and implements batched greedy generation.
    """

    # Class-level hooks overridden by subclasses to pick the HF Auto* family.
    AUTO_CONFIG_CLASS: transformers.AutoConfig = transformers.AutoConfig
    AUTO_TOKENIZER_CLASS: transformers.AutoTokenizer = transformers.AutoTokenizer
    AUTO_MODEL_CLASS: transformers.AutoModel = None
    AUTO_PEFT_CLASS: peft.PeftModel = None

    # Default max sequence length setting for when no `max_length` is provided
    # or no max length config setting is found in the model or tokenizer.
    _DEFAULT_MAX_LENGTH: int = 2048

    def __init__(
        self,
        pretrained: str,
        quantized: Optional[Union[bool, str]] = None,
        tokenizer: Optional[str] = None,
        subfolder: Optional[str] = None,
        revision: Optional[str] = "main",
        batch_size: Optional[int] = 1,
        max_gen_toks: Optional[int] = 256,
        max_length: Optional[int] = None,
        add_special_tokens: Optional[bool] = None,
        use_accelerate: Optional[bool] = False,
        device_map_option: Optional[str] = "auto",
        max_memory_per_gpu: Optional[Union[int, str]] = None,
        max_cpu_memory: Optional[Union[int, str]] = None,
        offload_folder: Optional[str] = "./offload",
        dtype: Optional[Union[str, torch.dtype]] = None,
        device: Optional[Union[int, str]] = "cuda",
        peft: str = None,
        load_in_8bit: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
        use_fast: Optional[bool] = True,
        gptq_use_triton: Optional[bool] = False,
    ):
        """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.
        Args:
            pretrained (str):
                The HuggingFace Hub model ID name or the path to a pre-trained
                model to load. This is effectively the `pretrained_model_name_or_path`
                argument of `from_pretrained` in the HuggingFace `transformers` API.
            quantized (str or True, optional, defaults to None):
                File name of a GPTQ quantized model to load. Set to `True` to use the
                default name of the quantized model.
            add_special_tokens (bool, optional, defaults to True):
                Whether to add special tokens to the input sequences. If `None`, the
                default value will be set to `True` for seq2seq models (e.g. T5) and
                `False` for causal models.
                WARNING: Evaluating causal models with `add_special_tokens=True` is
                currently __not__ supported.
            > Large model loading `accelerate` arguments
            use_accelerate (bool, optional, defaults to False):
                If True, uses the `accelerate` library to load a large model across
                multiple devices.
            device_map_option (str, optional, defaults to "auto"):
                The device map option to use when loading the model with
                `accelerate`.
                Options:
                    "auto", "balanced", "balanced_low_0", "sequential"
                See the `accelerate` docs for more details on these options:
                https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.device_map
            max_memory_per_gpu (Union[int, str], optional, defaults to None):
                The maximum memory available for each GPU in bytes as `int` or in
                the format f"{significand}{unit_symbol}" where {unit_symbol} is
                any of ["GB", "MB", "GIB", "MIB"]. Refer to the `max_memory` arg in
                the "Parameters for big model inference" section of the following
                docs:
                https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.max_memory
            max_cpu_memory (Union[int, str], optional, defaults to None):
                The maximum available CPU RAM in bytes as `int` or in the format
                f"{significand}{unit_symbol}" where {unit_symbol} is any of
                ["GB", "MB", "GIB", "MIB"]. Refer to the `max_memory` arg in the
                "Parameters for big model inference" section of the following docs:
                https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.max_memory
            offload_folder (str, optional, defaults to "./offload"):
                The folder to offload weights into if `device_map` contains any
                "disk" value.
            dtype (Union[str, torch.dtype], optional, defaults to None):
                Converts the model weights to `dtype`, if specified. Strings get
                converted to `torch.dtype` objects (e.g. `float16` -> `torch.float16`).
                Use `dtype="auto"` to derive the type from the model's weights.
            peft (str, optional, defaults to None):
                Path of the adapter weights to load from Huggingface. This will usually
                include a directory that includes the files `adapter_config.json` and
                `adapter_model.bin`. Compatible with [PEFT](https://github.com/huggingface/peft)
            load_in_8bit (bool, optional, defaults to False):
                If True, will convert the loaded model into mixed-8bit quantized model. See:
                https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.load_in_8bit
            trust_remote_code (bool, optional, defaults to False):
                If True, will trust the remote code when loading the model.
            use_fast (bool, optional, defaults to True):
                If True, will use the fast tokenizer when loading the model.
            gptq_use_triton (bool, optional, defaults to False):
                Use Triton for GPTQ inference.
        """
        super().__init__()

        assert isinstance(pretrained, str)
        assert isinstance(device, str)
        assert isinstance(batch_size, int)
        if (
            add_special_tokens is not None
            and self.AUTO_MODEL_CLASS is transformers.AutoModelForCausalLM
        ):
            # TODO: Support evaluating causal models with special tokens. Currently,
            # this is not possible because the `_loglikelihood_tokens()` method for
            # causal LMs makes a no-special-tokens assumption given that contexts
            # and labels/continuations are tokenized separately without special
            # tokens, concatenated, and then processed as inputs.
            assert (
                not add_special_tokens
            ), "Evaluating causal models with `add_special_tokens=True` is currently not supported."

        self._batch_size = batch_size  # TODO: Adaptive batch size
        self._max_gen_toks = max_gen_toks
        self._max_length = max_length
        # `subfolder` is appended to `revision` as a path ("main/sub"), which is
        # the convention `from_pretrained` uses for subfolder revisions here.
        self._config = self.AUTO_CONFIG_CLASS.from_pretrained(
            pretrained,
            trust_remote_code=trust_remote_code,
            revision=revision + ("/" + subfolder if subfolder is not None else ""),
        )

        self._add_special_tokens = add_special_tokens
        self.tokenizer = self._create_auto_tokenizer(
            pretrained=pretrained,
            revision=revision,
            subfolder=subfolder,
            tokenizer=tokenizer,
            use_fast=use_fast,
        )
        # Clamp the tokenizer's own limit to the effective model max length.
        self.tokenizer.model_max_length = self.max_length

        model_kwargs = {}
        if use_accelerate:
            # device_map / max_memory / offload_folder kwargs for from_pretrained.
            model_kwargs = _get_accelerate_args(
                device_map_option,
                max_memory_per_gpu,
                max_cpu_memory,
                offload_folder,
            )
        model_kwargs["load_in_8bit"] = load_in_8bit
        self.model = self._create_auto_model(
            pretrained=pretrained,
            quantized=quantized,
            trust_remote_code=trust_remote_code,
            revision=revision,
            subfolder=subfolder,
            torch_dtype=_get_dtype(dtype, self._config),
            gptq_use_triton=gptq_use_triton,
            **model_kwargs,
        )
        # note: peft_path can be different than pretrained model path
        if peft is not None:
            self.model = self._create_auto_model_peft(
                model=self.model,
                peft=peft,
                revision=revision,
                subfolder=subfolder,
                torch_dtype=_get_dtype(dtype, self._config),
                **model_kwargs,
            )
        # Evaluation only: disable dropout and gradient tracking globally.
        self.model.eval()
        torch.set_grad_enabled(False)

        self._device = device
        if use_accelerate and "lm_head" in self.model.hf_device_map:
            # `accelerate` can place `lm_head` weights on a different device than
            # the user specified one so we force `self._device` to be the same as
            # `lm_head`'s.
            self._device = self.model.hf_device_map["lm_head"]
        if not use_accelerate and not load_in_8bit:
            try:
                self.model.to(self._device)
            except:  # noqa: E722
                # Deliberate best-effort: some quantized models refuse .to().
                print(
                    "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore."
                )

    def _create_auto_model(
        self,
        *,
        pretrained: str,
        quantized: Optional[Union[bool, str]] = None,
        revision: str,
        subfolder: str,
        device_map: Optional[Union[str, _DeviceMapping]] = None,
        max_memory: Optional[dict] = None,
        offload_folder: Optional[str] = None,
        load_in_8bit: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
        torch_dtype: Optional[Union[str, torch.dtype]] = None,
        gptq_use_triton: Optional[bool] = False,
    ) -> transformers.AutoModel:
        """Returns a pre-trained pytorch model from a pre-trained model configuration."""
        if quantized is None:
            model = self.AUTO_MODEL_CLASS.from_pretrained(
                pretrained,
                revision=revision + ("/" + subfolder if subfolder is not None else ""),
                device_map=device_map,
                max_memory=max_memory,
                offload_folder=offload_folder,
                load_in_8bit=load_in_8bit,
                trust_remote_code=trust_remote_code,
                torch_dtype=torch_dtype,
            )
        else:
            # GPTQ path; imported lazily so auto_gptq is only required when used.
            from auto_gptq import AutoGPTQForCausalLM

            model = AutoGPTQForCausalLM.from_quantized(
                pretrained,
                # `quantized is True` means "use the default basename"; a str
                # names the checkpoint file, whose stem is the basename.
                model_basename=None if quantized is True else Path(quantized).stem,
                device_map=device_map,
                max_memory=max_memory,
                trust_remote_code=trust_remote_code,
                use_safetensors=True
                if quantized is True
                else quantized.endswith(".safetensors"),
                use_triton=gptq_use_triton,
                warmup_triton=gptq_use_triton,
            )
        return model

    def _create_auto_model_peft(
        self,
        *,
        model: transformers.PreTrainedModel,
        peft: str,
        revision: str,
        subfolder: str,
        device_map: Optional[Union[str, _DeviceMapping]] = None,
        max_memory: Optional[dict] = None,
        offload_folder: Optional[str] = None,
        load_in_8bit: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
        torch_dtype: Optional[Union[str, torch.dtype]] = None,
    ):
        """Wraps `model` with PEFT adapter weights loaded from `peft`."""
        model = self.AUTO_PEFT_CLASS.from_pretrained(
            model,
            peft,
            revision=revision + ("/" + subfolder if subfolder is not None else ""),
            device_map=device_map,
            max_memory=max_memory,
            offload_folder=offload_folder,
            load_in_8bit=load_in_8bit,
            trust_remote_code=trust_remote_code,
            torch_dtype=torch_dtype,
        )
        return model

    def _create_auto_tokenizer(
        self,
        *,
        pretrained: str,
        revision: str,
        subfolder: str,
        tokenizer: Optional[str] = None,
        use_fast: Optional[bool] = True,
    ) -> transformers.PreTrainedTokenizer:
        """Returns a pre-trained tokenizer from a pre-trained tokenizer configuration."""
        tokenizer = self.AUTO_TOKENIZER_CLASS.from_pretrained(
            pretrained if tokenizer is None else tokenizer,
            revision=revision + ("/" + subfolder if subfolder is not None else ""),
            use_fast=use_fast,
        )
        # Models without a pad token (e.g. GPT-2 style) pad with EOS instead.
        tokenizer.pad_token = tokenizer.eos_token
        return tokenizer

    @property
    def add_special_tokens(self) -> bool:
        """Whether to include special tokens in encoded text. This should be
        determined by whether or not the model was trained with special tokens.
        TODO: Remove these conditionals once HuggingFace supports a way to
        check whether or not an arbitrary model was trained with special tokens.
        """
        if self._add_special_tokens is not None:
            return self._add_special_tokens
        elif self.AUTO_MODEL_CLASS is transformers.AutoModelForCausalLM:
            return False
        elif self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM:
            return True
        else:
            raise ValueError(
                "Could not determine `add_special_tokens` value from the model "
                "class. Set to `True` or `False` depending on whether the model "
                "was pre-trained with special tokens."
            )

    @property
    def eot_token(self) -> str:
        # End-of-text token string (EOS doubles as EOT here).
        return self.tokenizer.eos_token

    @property
    def eot_token_id(self) -> int:
        # End-of-text token id (EOS doubles as EOT here).
        return self.tokenizer.eos_token_id

    @property
    def max_gen_toks(self) -> int:
        # Maximum number of new tokens produced per generation request.
        return self._max_gen_toks

    @property
    def max_length(self) -> int:
        """Return the maximum sequence length of the model.
        NOTE: Different model configurations have different max sequence length
        attribute names.
            - n_positions: (CTRLConfig)
            - max_position_embeddings: (BartConfig, RoFormerConfig)
            - n_ctx: (GPT2Config)
        NOTE: For relative position encoded models you should specify the max
        sequence length of the model in the constructor via `max_length`.
        """
        if self._max_length is not None:
            return self._max_length
        # Try to get the sequence length from the model config.
        seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
        for attr in seqlen_config_attrs:
            if hasattr(self._config, attr):
                return getattr(self._config, attr)
        if hasattr(self.tokenizer, "model_max_length"):
            return self.tokenizer.model_max_length
        return self._DEFAULT_MAX_LENGTH

    @property
    def batch_size(self) -> int:
        # TODO: Add adaptive batch size.
        return self._batch_size  # * gpus

    @property
    def device(self) -> Union[int, str, torch.device]:
        # Device the model inputs should be moved to before a forward pass.
        return self._device

    def tok_encode(self, string: str) -> TokenSequence:
        # TODO: Merge `tok_encode_batch` here.
        return self.tokenizer.encode(string, add_special_tokens=self.add_special_tokens)

    def tok_encode_batch(self, strings: List[str]) -> TokenSequence:
        # Batch tokenization with padding; returns a BatchEncoding of tensors.
        return self.tokenizer(
            strings,
            padding=True,
            add_special_tokens=self.add_special_tokens,
            return_tensors="pt",
        )

    def tok_decode(self, tokens: torch.LongTensor) -> List[str]:
        # Decode a batch of token id sequences, dropping special tokens.
        return self.tokenizer.batch_decode(tokens, skip_special_tokens=True)

    def greedy_until(
        self, requests: List[Tuple[str, Union[List[str], str]]]
    ) -> List[str]:
        """Greedy-generate a continuation for each (context, stop-sequences)
        request, truncating each response at the first stop sequence.

        Requests are reordered by tokenized context length so batches are
        size-homogeneous, then restored to original order at the end.
        """
        def _collate(x):
            # Sort key: (token length, raw context) of each request.
            tokens = self.tok_encode(x[0])
            return len(tokens), x[0]

        results = []
        reorder = utils.Reorderer(requests, _collate)
        for chunk in utils.chunks(
            tqdm(reorder.get_reordered(), disable=False), self.batch_size
        ):
            context = [c[0] for c in chunk]
            # NOTE(review): stop sequences are taken from the FIRST request in
            # the chunk and applied to the whole batch — assumes homogeneous
            # request args within a chunk.
            request_args = chunk[0][1]
            stop_sequences = (
                request_args if isinstance(request_args, list) else [request_args]
            )  # request_args["stop_sequences"]
            max_generation_length = (
                self._max_gen_toks
            )  # request_args["max_generation_length"]

            assert (
                isinstance(max_generation_length, int) or max_generation_length is None
            )
            assert isinstance(stop_sequences, list) or stop_sequences is None

            # TODO: Find a better way to handle stop sequences for 0-shot.
            if stop_sequences is None:
                until = [self.eot_token]
            else:
                until = stop_sequences + [self.eot_token]

            if max_generation_length is None:
                max_tokens = self.max_gen_toks
            else:
                max_tokens = max_generation_length

            token_context = self.tok_encode_batch(context)

            responses = self._model_generate(
                inputs=token_context,
                max_tokens=max_tokens,
                stop=until,
            )
            responses = self.tok_decode(responses.tolist())

            for response in responses:
                # Ensure the generated responses do not contain the stop sequences.
                for term in until:
                    response = response.split(term)[0]
                # partial caching
                self.cache_hook.add_partial("greedy_until", (context, until), response)
                results.append(response)
        return reorder.get_original(results)
|
| 457 |
+
|
| 458 |
+
|
| 459 |
+
class AutoCausalLM(HuggingFaceAutoLM):
    """Causal language modeling.
    You can find a set of supported models in the HF documentation:
    https://huggingface.co/docs/transformers/main/model_doc/auto#transformers.AutoModelForCausalLM
    """

    AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
    AUTO_PEFT_CLASS = peft.PeftModel

    def _create_auto_tokenizer(
        self,
        *,
        pretrained: str,
        revision: str,
        subfolder: str,
        tokenizer: Optional[str] = None,
        use_fast: Optional[bool] = True,
    ) -> transformers.PreTrainedTokenizer:
        """Same as the base tokenizer, but left-padded: causal generation
        expects padding on the left so continuations align at the right edge.
        """
        tokenizer = super()._create_auto_tokenizer(
            pretrained=pretrained,
            revision=revision,
            subfolder=subfolder,
            tokenizer=tokenizer,
            use_fast=use_fast,
        )
        tokenizer.padding_side = "left"
        return tokenizer

    def _model_call(
        self, inputs: TokenSequence, labels: Optional[TokenSequence] = None
    ) -> TokenSequence:
        # `labels` is unused for causal LMs; scoring happens on the logits.
        return self.model(inputs)["logits"]

    def _model_generate(
        self,
        inputs: transformers.BatchEncoding,
        max_tokens: int,
        stop: Optional[List[str]] = None,
    ) -> TokenSequence:
        # Ensure that the context does not encroach into the `space`
        # for the generation.
        # (`max_gen_toks - max_length` is negative, so this keeps only the
        # last `max_length - max_gen_toks` context tokens.)
        input_ids = inputs["input_ids"][:, self.max_gen_toks - self.max_length :]
        attention_mask = inputs["attention_mask"][
            :, self.max_gen_toks - self.max_length :
        ]
        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)

        # Stop as soon as every sequence in the batch has produced one of
        # the `stop` strings beyond the prompt.
        stopping_criteria = stop_sequences_criteria(
            self.tokenizer, stop, input_ids.shape[1], input_ids.shape[0]
        )

        generations = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            # GPT style models require the `generate` `max_length` arg to include the
            # context length, so we instead set `max_new_tokens` which is the number
            # of new tokens to generate, excluding the current number of tokens.
            max_new_tokens=max_tokens,
            stopping_criteria=stopping_criteria,
            do_sample=False,
        )
        # Strip the (left-padded) prompt tokens, returning only continuations.
        return utils.select_continuation_from_batch_left_padding(
            generations, max_context_size=inputs["input_ids"].size(1)
        )
|
| 524 |
+
|
| 525 |
+
|
| 526 |
+
class AutoSeq2SeqLM(HuggingFaceAutoLM):
    """Seq2Seq language modeling.
    You can find a set of supported models in the following documentation:
    https://huggingface.co/docs/transformers/main/model_doc/auto#transformers.AutoModelForSeq2SeqLM
    """

    AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
    AUTO_PEFT_CLASS = peft.PeftModel

    @property
    def max_length(self) -> int:
        """Return the maximum sequence length of the model.
        TODO: Currently only works for relative position encoded Seq2Seq models.
        """
        if self._max_length is not None:
            return self._max_length
        return self._DEFAULT_MAX_LENGTH

    def loglikelihood(
        self, requests: List[Tuple[str, str]]
    ) -> List[Tuple[float, bool]]:
        """Batch-tokenize (context, continuation) pairs and score them.

        Contexts feed the encoder; continuations are the decoder targets,
        so the two are encoded separately (not concatenated).
        """
        new_requests = []
        for chunk in utils.chunks(requests, self.batch_size):
            context, continuation = zip(*chunk)

            # Fill empty contexts with the EOT token.
            context = [
                f"{self.eot_token}" if len(text) == 0 else text for text in context
            ]
            context_enc = self.tok_encode_batch(context)
            for key in context_enc:
                # Keep only the last `max_length` positions of each tensor.
                context_enc[key] = context_enc[key][:, -self.max_length :]

            # Remove leading whitespace introduced by the default
            # `text_target_separator` since the context and continuation
            # will not be concatenated as a single (decoder) input.
            continuation = [text.lstrip() for text in continuation]
            continuation_enc = self.tok_encode_batch(list(continuation))
            for key in continuation_enc:
                continuation_enc[key] = continuation_enc[key][:, -self.max_length :]

            new_requests.append(
                ((context, continuation), context_enc, continuation_enc)
            )
        return self._loglikelihood_tokens(new_requests)

    def loglikelihood_rolling(self, requests: List[Tuple[str, str]]) -> List[float]:
        """Full-string log-likelihood via disjoint rolling windows, summed
        per input string."""
        loglikelihoods = []
        for (string,) in tqdm(requests):
            rolling_token_windows = list(
                map(
                    utils.make_disjoint_window,
                    utils.get_rolling_token_windows(
                        token_list=self.tok_encode(string),
                        prefix_token=self.eot_token_id,
                        max_seq_len=self.max_length,
                        context_len=1,
                    ),
                )
            )
            contexts, conts = utils.split_and_pad_windows(
                rolling_token_windows,
                pad_token_id=self.eot_token_id,
                max_seq_len=self.max_length,
            )
            # Manually create BatchEncoding tensors with attention masks as
            # expected by `self._model_call` in `self._loglikelihood_tokens`.
            # NOTE(review): the mask treats every EOT id as padding, so a
            # genuine EOT inside the text would also be masked — confirm
            # acceptable for the windows produced above.
            contexts_enc = torch.Tensor(contexts).long()
            contexts_enc = transformers.tokenization_utils_base.BatchEncoding(
                {
                    "input_ids": contexts_enc,
                    "attention_mask": (contexts_enc != self.eot_token_id).long(),
                }
            )
            conts_enc = torch.Tensor(conts).long()
            conts_enc = transformers.tokenization_utils_base.BatchEncoding(
                {
                    "input_ids": conts_enc,
                    "attention_mask": (conts_enc != self.eot_token_id).long(),
                }
            )
            # TODO: Extract out this call so it only gets called once and also
            # somehow figure out partial caching for.
            rolling_token_windows_request = [
                ((contexts, conts), contexts_enc, conts_enc)
            ]
            string_nll = self._loglikelihood_tokens(
                rolling_token_windows_request, disable_tqdm=True
            )
            string_nll = [x[0] for x in string_nll]  # discard is_greedy
            string_nll = sum(string_nll)
            loglikelihoods.append(string_nll)
        return loglikelihoods

    def _loglikelihood_tokens(
        self,
        requests: List[Tuple[Tuple[str, str], TokenSequence, TokenSequence]],
        disable_tqdm: Optional[bool] = False,
    ) -> List[Tuple[float, bool]]:
        """Score pre-tokenized batches; returns (log-prob sum, is_greedy)
        per example."""
        results = []
        # NOTE(review): `math.ceil(len(requests))` is a no-op on an int; the
        # intended total was presumably len(requests) (already chunked).
        for chunk in tqdm(
            requests, total=math.ceil(len(requests)), disable=disable_tqdm
        ):
            cache_keys, inputs_tokens, targets_tokens = chunk
            inputs_tokens = inputs_tokens.to(self.device)
            targets_tokens = targets_tokens.to(self.device)
            outputs = self._model_call(inputs=inputs_tokens, labels=targets_tokens)
            log_softmaxes = F.log_softmax(outputs.logits, dim=-1)

            output_iterator = zip(
                zip(cache_keys[0], cache_keys[1]),
                log_softmaxes,
                targets_tokens["input_ids"],
                targets_tokens["attention_mask"],
            )
            for cache_key, log_softmax, target_tokens, target_mask in output_iterator:
                # Score only the unpadded prefix of the target sequence.
                length = target_mask.sum()
                log_softmax = log_softmax[:length]
                target_tokens = target_tokens[:length]
                greedy_tokens = log_softmax.argmax(dim=-1)
                # is_greedy: would pure argmax decoding reproduce the target?
                max_equal = (greedy_tokens == target_tokens).all()
                target_logits = torch.gather(
                    log_softmax, 1, target_tokens.unsqueeze(-1)
                ).squeeze(-1)
                answer = (float(target_logits.sum()), bool(max_equal))
                results.append(answer)
                if cache_key is not None:
                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)
        return results

    def _model_call(
        self, inputs: TokenSequence, labels: Optional[TokenSequence] = None
    ) -> TokenSequence:
        # Seq2seq forward pass: encoder inputs plus decoder labels.
        return self.model(**inputs, labels=labels["input_ids"])

    def _model_generate(
        self,
        inputs: transformers.BatchEncoding,
        max_tokens: int,
        stop: Optional[List[str]] = None,
    ) -> TokenSequence:
        input_ids = inputs["input_ids"][:, -self.max_length :].to(self.device)
        attention_mask = inputs["attention_mask"][:, -self.max_length :].to(self.device)

        # Generate one token to calculate the number of start tokens prepended to decoder_input_ids
        # (leaving this here in case the below assumption is violated in the future)
        # one_tok_gen = self.model.generate(
        #    input_ids=torch.zeros((1, 1), dtype=torch.int),
        #    min_length=2,
        #    max_new_tokens=1,
        # ).squeeze()
        # initial_decoder_input_length = len(one_tok_gen) - 1

        # Assume that there will always only be one token in the decoder inputs, assumption holds for existing HF models
        stopping_criteria = stop_sequences_criteria(
            self.tokenizer, stop, 1, input_ids.shape[0]
        )

        generations = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_tokens,
            stopping_criteria=stopping_criteria,
            do_sample=False,
        )
        # Decoder output only (no prompt to strip for seq2seq models).
        return generations
|
| 692 |
+
|
| 693 |
+
|
| 694 |
+
class MultiTokenEOSCriteria(transformers.StoppingCriteria):
    """Criteria to stop on the specified multi-token sequence.

    Tracks completion per batch row: once a row has emitted `sequence`
    (anywhere past the initial decoder input), it is marked done; generation
    stops when every row is done.
    """

    def __init__(
        self,
        sequence: str,
        tokenizer: transformers.PreTrainedTokenizer,
        initial_decoder_input_length: int,
        batch_size: int,
    ):
        self.initial_decoder_input_length = initial_decoder_input_length
        # One done-flag per batch row.
        self.done_tracker = [False] * batch_size
        self.sequence = sequence
        self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
        self.sequence_id_len = len(self.sequence_ids)
        self.tokenizer = tokenizer

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence
        generated = input_ids[:, self.initial_decoder_input_length :]
        lookback_ids_batch = generated[:, -self.sequence_id_len :]

        lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)

        # A row stays done once done; otherwise check the decoded tail.
        self.done_tracker = [
            done or self.sequence in decoded
            for done, decoded in zip(self.done_tracker, lookback_tokens_batch)
        ]
        return all(self.done_tracker)
|
| 723 |
+
|
| 724 |
+
|
| 725 |
+
def stop_sequences_criteria(
    tokenizer: transformers.PreTrainedTokenizer,
    stop_sequences: List[str],
    initial_decoder_input_length: int,
    batch_size: int,
) -> transformers.StoppingCriteriaList:
    """Build a `StoppingCriteriaList` with one `MultiTokenEOSCriteria`
    per stop sequence."""
    criteria = [
        MultiTokenEOSCriteria(
            sequence, tokenizer, initial_decoder_input_length, batch_size
        )
        for sequence in stop_sequences
    ]
    return transformers.StoppingCriteriaList(criteria)
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/models/textsynth.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
""" TextSynth API
|
| 2 |
+
Implementation provided by Fabrice Bellard:
|
| 3 |
+
https://github.com/EleutherAI/lm-evaluation-harness/issues/295
|
| 4 |
+
|
| 5 |
+
In order to use the API, you must have a valid TextSynth account and
|
| 6 |
+
enough credits.
|
| 7 |
+
|
| 8 |
+
Example usage:
|
| 9 |
+
|
| 10 |
+
python main.py --model textsynth --model_args engine=gptj_6B --no_cache --tasks piqa
|
| 11 |
+
|
| 12 |
+
Homepage: https://textsynth.com/index.html
|
| 13 |
+
"""
|
| 14 |
+
import logging
|
| 15 |
+
import os
|
| 16 |
+
import requests as _requests
|
| 17 |
+
import time
|
| 18 |
+
from tqdm import tqdm
|
| 19 |
+
from lm_eval.base import BaseLM
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def textsynth_completion(**kwargs):
    """Query the TextSynth API for a completion.

    Retries forever with multiplicative back-off (3s, 4.5s, 6.75s, ...)
    until the HTTP request goes through.

    :param kwargs: keyword arguments forwarded verbatim to `requests.post`
        (e.g. `url`, `headers`, `json`).
    :return: the `requests.Response` of the first successful POST.
    """
    backoff_time = 3
    while True:
        try:
            return _requests.post(**kwargs)
        except _requests.exceptions.RequestException:
            # Record the full traceback through the module logger instead of
            # printing straight to stderr, then wait before retrying.
            logger.exception(
                "TextSynth API request failed; retrying in %s seconds", backoff_time
            )
            time.sleep(backoff_time)
            backoff_time *= 1.5
| 40 |
+
|
| 41 |
+
class TextSynthLM(BaseLM):
    """Language model backed by the TextSynth HTTP API.

    The API accepts and returns plain strings, so the tokenizer-level hooks
    of `BaseLM` are unimplemented and `loglikelihood`,
    `loglikelihood_rolling` and `greedy_until` are overridden to talk to the
    API directly.
    """

    def __init__(self, engine, truncate=False):
        """
        :param engine: str
            TextSynth API engine (e.g. `gptj_6B`)
        :param truncate: bool
            Truncate input if too long (if False and input is too long, throw error)
        """
        super().__init__()

        self.engine = engine
        self.truncate = truncate
        self.api_url = "https://api.textsynth.com"
        # Read from environment variable TEXTSYNTH_API_SECRET_KEY;
        # raises KeyError if the key is not set.
        self.api_key = os.environ["TEXTSYNTH_API_SECRET_KEY"]

    @property
    def eot_token_id(self):
        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
        raise NotImplementedError()

    @property
    def max_length(self):
        # NOTE: Turn on truncation to avoid errors on long inputs.
        return 2048

    @property
    def max_gen_toks(self):
        return 256

    @property
    def batch_size(self):
        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
        raise NotImplementedError()

    @property
    def device(self):
        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
        raise NotImplementedError()

    def tok_encode(self, string: str):
        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
        raise NotImplementedError()

    def tok_decode(self, tokens):
        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
        raise NotImplementedError()

    def loglikelihood(self, requests):
        """Score each (context, continuation) pair via the `/logprob` endpoint.

        :param requests: iterable of (context, continuation) string pairs.
        :return: list of (logprob, is_greedy) tuples, one per request.
        """
        res = []
        for context, continuation in tqdm(requests):
            response = textsynth_completion(
                url=self.api_url + "/v1/engines/" + self.engine + "/logprob",
                headers={"Authorization": "Bearer " + self.api_key},
                json={"context": context, "continuation": continuation},
            )
            resp = response.json()
            if "logprob" in resp:
                logprob = resp["logprob"]
                is_greedy = resp["is_greedy"]
                res.append((logprob, is_greedy))
            else:
                # Fixed: message previously said `logprobs`, but the key the
                # API returns (and that we check above) is `logprob`.
                logger.error(
                    f"The following response does not contain `logprob`. Got:\n{resp}"
                )
                assert False
        return res

    def loglikelihood_rolling(self, requests):
        # TODO: The TextSynth API does not support tokenized inputs so we cannot
        # manually partition long contexts into smaller rolling windows as
        # done for other models derived from `BaseLM`. Override this method
        # with a windowing scheme that works for direct string inputs.
        raise NotImplementedError(
            "`loglikelihood_rolling` is currently not supported due to lack of "
            "input tokenization support from TextSynth."
        )

    def greedy_until(self, requests):
        """Greedy-decode (top_k=1) each prompt via the `/completions` endpoint.

        :param requests: iterable of (prompt, until) pairs, where `until` is
            the stop-sequence spec forwarded as the API's `stop` field.
        :return: list of generated strings, one per request.
        """
        if not requests:
            return []

        res = []
        for request in tqdm(requests):
            inp = request[0]
            until = request[1]
            response = textsynth_completion(
                url=self.api_url + "/v1/engines/" + self.engine + "/completions",
                headers={"Authorization": "Bearer " + self.api_key},
                json={
                    "prompt": inp,
                    "max_tokens": self.max_gen_toks,
                    "top_k": 1,
                    "stop": until,
                },
            )
            resp = response.json()
            if "text" in resp:
                s = resp["text"]
                res.append(s)
            else:
                # Fixed: the second literal was missing its `f` prefix, so
                # `{resp}` was logged verbatim instead of being interpolated.
                logger.error(
                    "The following response does not contain generated `text`. "
                    f"Got:\n{resp}"
                )
                assert False
        return res

    def _model_call(self, inps):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    def _model_generate(self, context, max_length, eos_token_id):
        # Isn't used because we override greedy_until
        raise NotImplementedError()
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (15.8 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/anli.cpython-310.pyc
ADDED
|
Binary file (5.54 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/arc.cpython-310.pyc
ADDED
|
Binary file (3.45 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/arithmetic.cpython-310.pyc
ADDED
|
Binary file (5.3 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/asdiv.cpython-310.pyc
ADDED
|
Binary file (4.26 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/blimp.cpython-310.pyc
ADDED
|
Binary file (15.7 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/cbt.cpython-310.pyc
ADDED
|
Binary file (6.11 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/coqa.cpython-310.pyc
ADDED
|
Binary file (6.31 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/crowspairs.cpython-310.pyc
ADDED
|
Binary file (11.3 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/drop.cpython-310.pyc
ADDED
|
Binary file (10.9 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/gsm8k.cpython-310.pyc
ADDED
|
Binary file (5.53 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/headqa.cpython-310.pyc
ADDED
|
Binary file (3.89 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hellaswag.cpython-310.pyc
ADDED
|
Binary file (3.54 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hendrycks_math.cpython-310.pyc
ADDED
|
Binary file (8.45 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hendrycks_test.cpython-310.pyc
ADDED
|
Binary file (6.22 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada.cpython-310.pyc
ADDED
|
Binary file (4.73 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada_cloze.cpython-310.pyc
ADDED
|
Binary file (2.9 kB). View file
|
|
|
scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada_multilingual.cpython-310.pyc
ADDED
|
Binary file (4.05 kB). View file
|
|
|