diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/__init__.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4834200937f4ece26e98c677948ce315925e9811 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/base.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3f09999ba3c44646bef7d02361f12932f268805 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/base.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/evaluator.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/evaluator.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f07efc2f2deb0b4118ba473a5517ad0ad6cc915d Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/evaluator.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/metrics.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/metrics.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9099b40d803811a8a8fb6d97b5efecd1c5751ab7 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/metrics.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/utils.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51a3c608b71eb71df22b50e61995123d6ef621a7 Binary files /dev/null and 
b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/utils.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/README.md b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/README.md new file mode 100644 index 0000000000000000000000000000000000000000..01a026359725c91cd984633f07681c046bc0d2ae --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/README.md @@ -0,0 +1,8 @@ +# datasets + +This directory contains custom HuggingFace [dataset loading scripts](https://huggingface.co/docs/datasets/dataset_script). They are provided to maintain backward compatibility with the ad-hoc data downloaders in earlier versions of the `lm-evaluation-harness` before HuggingFace [`datasets`](https://huggingface.co/docs/datasets/index) was adopted as the default downloading manager. For example, some instances in the HuggingFace `datasets` repository process features (e.g. whitespace stripping, lower-casing, etc.) in ways that the `lm-evaluation-harness` did not. + +__NOTE__: We are __not__ accepting any additional loading scripts into the main branch! If you'd like to use a custom dataset, fork the repo and follow HuggingFace's loading script guide found [here](https://huggingface.co/docs/datasets/dataset_script). You can then override your `Task`'s `DATASET_PATH` attribute to point to this script's local path. + + +__WARNING__: A handful of loading scripts are included in this collection because they have not yet been pushed to the Huggingface Hub or a HuggingFace organization repo. We will remove such scripts once pushed. 
diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__pycache__/__init__.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee1306748cb4d983e3dcac4733490edba772a13e Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__pycache__/drop.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__pycache__/drop.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f91c157735f8450d605b7a3b45b02d026e954e93 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__pycache__/drop.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/dataset_infos.json b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/dataset_infos.json new file mode 100644 index 0000000000000000000000000000000000000000..f155e7720d0aa5e330496c4f945f7c047424cc61 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/dataset_infos.json @@ -0,0 +1 @@ +{"drop": {"description": "DROP is a QA dataset which tests comprehensive understanding of paragraphs. 
In \nthis crowdsourced, adversarially-created, 96k question-answering benchmark, a \nsystem must resolve multiple references in a question, map them onto a paragraph,\nand perform discrete operations over them (such as addition, counting, or sorting).\n", "citation": "@misc{dua2019drop,\n title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, \n author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},\n year={2019},\n eprint={1903.00161},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://allenai.org/data/drop", "license": "", "features": {"section_id": {"dtype": "string", "id": null, "_type": "Value"}, "passage": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "query_id": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "validated_answers": {"feature": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": 
"Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "drop", "config_name": "drop", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 108858121, "num_examples": 77409, "dataset_name": "drop"}, "validation": {"name": "validation", "num_bytes": 12560739, "num_examples": 9536, "dataset_name": "drop"}}, "download_checksums": {"https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip": {"num_bytes": 8308692, "checksum": "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"}}, "download_size": 8308692, "post_processing_size": null, "dataset_size": 121418860, "size_in_bytes": 129727552}} diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/drop.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/drop.py new file mode 100644 index 0000000000000000000000000000000000000000..d892427b48f82dfb59b33f6844a7a5519ab09cf2 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/drop.py @@ -0,0 +1,192 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Custom DROP dataset that, unlike HF, keeps all question-answer pairs +# even if there are multiple types of answers for the same question. 
+"""DROP dataset.""" + + +import json +import os + +import datasets + + +_CITATION = """\ +@misc{dua2019drop, + title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, + author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner}, + year={2019}, + eprint={1903.00161}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +""" + +_DESCRIPTION = """\ +DROP is a QA dataset which tests comprehensive understanding of paragraphs. In +this crowdsourced, adversarially-created, 96k question-answering benchmark, a +system must resolve multiple references in a question, map them onto a paragraph, +and perform discrete operations over them (such as addition, counting, or sorting). +""" + +_HOMEPAGE = "https://allenai.org/data/drop" + +# TODO: Add the licence for the dataset here if you can find it +_LICENSE = "" + +_URLS = { + "drop": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip", +} + +_EMPTY_VALIDATED_ANSWER = [ + { + "number": "", + "date": { + "day": "", + "month": "", + "year": "", + }, + "spans": [], + "worker_id": "", + "hit_id": "", + } +] + + +class Drop(datasets.GeneratorBasedBuilder): + """DROP is a QA dataset which tests comprehensive understanding of paragraphs.""" + + VERSION = datasets.Version("0.0.1") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig( + name="drop", version=VERSION, description="The DROP dataset." 
+ ), + ] + + def _info(self): + features = datasets.Features( + { + "section_id": datasets.Value("string"), + "passage": datasets.Value("string"), + "question": datasets.Value("string"), + "query_id": datasets.Value("string"), + "answer": { + "number": datasets.Value("string"), + "date": { + "day": datasets.Value("string"), + "month": datasets.Value("string"), + "year": datasets.Value("string"), + }, + "spans": datasets.features.Sequence(datasets.Value("string")), + "worker_id": datasets.Value("string"), + "hit_id": datasets.Value("string"), + }, + "validated_answers": datasets.features.Sequence( + { + "number": datasets.Value("string"), + "date": { + "day": datasets.Value("string"), + "month": datasets.Value("string"), + "year": datasets.Value("string"), + }, + "spans": datasets.features.Sequence(datasets.Value("string")), + "worker_id": datasets.Value("string"), + "hit_id": datasets.Value("string"), + } + ), + } + ) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + urls = _URLS[self.config.name] + data_dir = dl_manager.download_and_extract(urls) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join( + data_dir, "drop_dataset", "drop_dataset_train.json" + ), + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join( + data_dir, "drop_dataset", "drop_dataset_dev.json" + ), + "split": "validation", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + def _generate_examples(self, filepath, split): + with open(filepath, encoding="utf-8") as f: + data = json.load(f) + key = 0 + for section_id, example in data.items(): + # Each example (passage) 
has multiple sub-question-answer pairs. + for qa in example["qa_pairs"]: + # Build answer. + answer = qa["answer"] + answer = { + "number": answer["number"], + "date": { + "day": answer["date"].get("day", ""), + "month": answer["date"].get("month", ""), + "year": answer["date"].get("year", ""), + }, + "spans": answer["spans"], + "worker_id": answer.get("worker_id", ""), + "hit_id": answer.get("hit_id", ""), + } + validated_answers = [] + if "validated_answers" in qa: + for validated_answer in qa["validated_answers"]: + va = { + "number": validated_answer.get("number", ""), + "date": { + "day": validated_answer["date"].get("day", ""), + "month": validated_answer["date"].get("month", ""), + "year": validated_answer["date"].get("year", ""), + }, + "spans": validated_answer.get("spans", ""), + "worker_id": validated_answer.get("worker_id", ""), + "hit_id": validated_answer.get("hit_id", ""), + } + validated_answers.append(va) + else: + validated_answers = _EMPTY_VALIDATED_ANSWER + yield key, { + "section_id": section_id, + "passage": example["passage"], + "question": qa["question"], + "query_id": qa["query_id"], + "answer": answer, + "validated_answers": validated_answers, + } + key += 1 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/__pycache__/hendrycks_math.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/__pycache__/hendrycks_math.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f52fc50cf67eca96522c73fb069fdaeaacfa97e Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/__pycache__/hendrycks_math.cpython-310.pyc 
differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/dataset_infos.json b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/dataset_infos.json new file mode 100644 index 0000000000000000000000000000000000000000..27d154efa50fd68aa23d3de656c9ce6449faed61 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/dataset_infos.json @@ -0,0 +1 @@ +{"algebra": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "algebra", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 955021, "num_examples": 1744, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 648291, "num_examples": 1187, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, 
"dataset_size": 1603312, "size_in_bytes": 21931248}, "counting_and_probability": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "counting_and_probability", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 667385, "num_examples": 771, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 353803, "num_examples": 474, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1021188, "size_in_bytes": 21349124}, "geometry": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "geometry", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1077241, "num_examples": 870, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 523126, "num_examples": 479, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1600367, "size_in_bytes": 21928303}, "intermediate_algebra": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "intermediate_algebra", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1157476, "num_examples": 1295, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 795070, "num_examples": 903, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1952546, "size_in_bytes": 22280482}, "number_theory": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "number_theory", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 595793, "num_examples": 869, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 349455, "num_examples": 540, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 945248, "size_in_bytes": 21273184}, "prealgebra": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "prealgebra", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 715611, "num_examples": 1205, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 510195, "num_examples": 871, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1225806, "size_in_bytes": 21553742}, "precalculus": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "precalculus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 816245, "num_examples": 746, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 552893, "num_examples": 546, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1369138, "size_in_bytes": 21697074}} diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/hendrycks_math.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/hendrycks_math.py new file mode 100644 index 0000000000000000000000000000000000000000..043adeeed6648ce04e3209b6359cb119699eddd1 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/hendrycks_math.py @@ -0,0 +1,122 @@ +# Copyright 2020 The HuggingFace Datasets Authors and 
the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MATH dataset.""" + + +import json +import os +import pathlib + +import datasets + + +_CITATION = """\ +@article{hendrycksmath2021, + title={Measuring Mathematical Problem Solving With the Math Dataset}, + author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt}, + journal={NeurIPS}, + year={2021} +} +""" + +_DESCRIPTION = """\ +MATH is a dataset of 12,500 challenging competition mathematics problems. Each +problem in Math has a full step-by-step solution which can be used to teach +models to generate answer derivations and explanations. 
+""" + +_HOMEPAGE = "https://github.com/hendrycks/math" + +# TODO: Add the licence for the dataset here if you can find it +_LICENSE = "" + +_URLS = "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar" + +_NAMES = [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus", +] + + +class HendrycksMath(datasets.GeneratorBasedBuilder): + """MATH is a dataset of 12,500 challenging competition mathematics problems.""" + + VERSION = datasets.Version("0.0.1") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig(name=name, version=version, description=name) + for name, version in zip(_NAMES, [VERSION] * len(_NAMES)) + ] + + def _info(self): + features = datasets.Features( + { + "problem": datasets.Value("string"), + "level": datasets.Value("string"), + "type": datasets.Value("string"), + "solution": datasets.Value("string"), + } + ) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + urls = _URLS + data_dir = dl_manager.download_and_extract(urls) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "basepath": os.path.join( + data_dir, "MATH", "train", self.config.name + ), + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "basepath": os.path.join( + data_dir, "MATH", "test", self.config.name + ), + "split": "test", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + def _generate_examples(self, basepath, split): + key = 0 + for file in sorted(pathlib.Path(basepath).iterdir()): + with open(file, "r", encoding="utf-8") as f: + data = json.load(f) + yield key, { + "problem": data["problem"], + "level": data["level"], + 
"type": data["type"], + "solution": data["solution"], + } + key += 1 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/decontamination/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/decontamination/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/decontamination/archiver.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/decontamination/archiver.py new file mode 100644 index 0000000000000000000000000000000000000000..488a55dd7352d333d68a3557058ca81dfe704a38 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/decontamination/archiver.py @@ -0,0 +1,161 @@ +import os +import zstandard +import json +import jsonlines +import io +import datetime +import mmap +import tqdm +from pathlib import Path + + +def json_serial(obj): + """JSON serializer for objects not serializable by default json code""" + + if isinstance(obj, (datetime.datetime,)): + return obj.isoformat() + raise TypeError("Type %s not serializable" % type(obj)) + + +# Modified version of lm_dataformat Archive for single file. +class Archive: + def __init__(self, file_path, compression_level=3): + self.file_path = file_path + dir_name = os.path.dirname(file_path) + if dir_name: + os.makedirs(dir_name, exist_ok=True) + self.fh = open(self.file_path, "wb") + self.cctx = zstandard.ZstdCompressor(level=compression_level) + self.compressor = self.cctx.stream_writer(self.fh) + + def add_data(self, data, meta={}): + self.compressor.write( + json.dumps({"text": data, "meta": meta}, default=json_serial).encode( + "UTF-8" + ) + + b"\n" + ) + + def commit(self): + self.compressor.flush(zstandard.FLUSH_FRAME) + self.fh.flush() + self.fh.close() + + +# Modified version of lm_dataformat Reader with self.fh set, allowing peeking for tqdm. 
# Modified version of lm_dataformat Reader with self.fh set, allowing peeking for tqdm.
class Reader:
    """Streams documents out of a zstd-compressed JSONL file."""

    def __init__(self):
        pass

    def read(self, file, get_meta=False, autojoin_paragraphs=True, para_joiner="\n\n"):
        """Yield document texts from `file`; with `get_meta`, yield (text, meta) pairs."""
        with open(file, "rb") as fh:
            self.fh = fh  # exposed so callers can peek at the raw file position
            cctx = zstandard.ZstdDecompressor()
            reader = io.BufferedReader(cctx.stream_reader(fh))
            rdr = jsonlines.Reader(reader)
            for ob in rdr:
                # naive jsonl where each object is just the string itself, with no meta. For legacy compatibility.
                if isinstance(ob, str):
                    assert not get_meta
                    yield ob
                    continue

                text = ob["text"]

                if autojoin_paragraphs and isinstance(text, list):
                    text = para_joiner.join(text)

                if get_meta:
                    yield text, (ob["meta"] if "meta" in ob else {})
                else:
                    yield text


class TextArchive:
    """Append-only plain-text archive, one record per line (UTF-8 + newline)."""

    def __init__(self, file_path, mode="rb+"):
        self.file_path = file_path
        dir_name = os.path.dirname(file_path)
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)

        # Create the file up front so the default "rb+" mode can open a fresh path.
        if not os.path.exists(file_path):
            Path(file_path).touch()

        self.fh = open(self.file_path, mode)

    def add_data(self, data):
        self.fh.write(data.encode("UTF-8") + b"\n")

    def commit(self):
        self.fh.flush()
        self.fh.close()


class TextReader:
    """Line-oriented readers over a text file, optimized via mmap."""

    def __init__(self, file_path):
        self.file_path = file_path

    # Optimized mmap read with infrequent tqdm updates to maintain speed
    # Tested up to 250MB/s.
    def read_tqdm(self, update_frequency=10000):
        """Yield lines (trailing newline stripped) with a byte-based progress bar."""
        current_file_position = 0
        line_counter = 0
        with open(self.file_path, "r") as fh, tqdm.tqdm(
            total=os.path.getsize(self.file_path),
            dynamic_ncols=True,
            unit="byte",
            unit_scale=1,
        ) as progress:
            with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                for line in iter(mmap_obj.readline, b""):
                    line = line.decode("utf-8")
                    line_counter += 1
                    # Update tqdm only every `update_frequency` lines; per-line
                    # progress updates would dominate runtime at these speeds.
                    if line_counter == update_frequency:
                        new_file_pos = mmap_obj.tell()
                        bytes_read = new_file_pos - current_file_position
                        current_file_position = new_file_pos
                        progress.update(bytes_read)
                        line_counter = 0
                    yield line[:-1]

    def read_and_tell(self):
        """Yield (line, raw_bytes_read) so callers can track consumed bytes."""
        current_file_position = 0
        with open(self.file_path, "r", encoding="utf8") as fh:
            with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                for line in iter(mmap_obj.readline, b""):
                    line = line.decode("utf-8")
                    new_file_pos = mmap_obj.tell()
                    raw_bytes_read = new_file_pos - current_file_position
                    current_file_position = new_file_pos
                    yield line[:-1], raw_bytes_read

    def read(self):
        """Yield lines (trailing newline stripped) via mmap."""
        with open(self.file_path, "r", encoding="utf8") as fh:
            with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                for line in iter(mmap_obj.readline, b""):
                    line = line.decode("utf-8")
                    yield line[:-1]

    def read_slow(self):
        """Plain readline() fallback; slower than the mmap-based readers."""
        with open(self.file_path, "r", encoding="utf8") as fh:
            while True:
                line = fh.readline()
                # Fix: readline() returns "" at EOF, never -1 — the old
                # `line == -1` comparison was dead code.
                if line == "":
                    break
                yield line[:-1]
# Optimized for speed. Decompresses the archive in shell before
# using the mmap'd TextReader.
class ZStdTextReader:
    """Reads a `.zst`-compressed text file by decompressing it to disk first."""

    def __init__(self, file):
        self.file = file

    def read_tqdm(self):
        """Decompress with the external `zstd` binary, stream lines, then clean up."""
        decompressed_file = self.file[:-4]  # strip the ".zst" suffix
        print("Decompressing file, please wait...")
        # Fix: use an argument list via subprocess instead of an interpolated
        # shell string, so paths containing spaces or shell metacharacters
        # cannot break (or inject into) the command.
        import subprocess

        subprocess.run(["zstd", "-d", self.file], check=True)  # linux decompress is faster
        reader = TextReader(decompressed_file)
        yield from reader.read_tqdm()
        os.remove(decompressed_file)


import time
import random
import pickle
import json
import glob
import collections


# Was used for testing the evaluator decoupled from the full logic below
def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
    """Pretend 10% of `docs` overlap the training set; return their indices."""
    simulated_overlap = 0.1
    contaminated = int(len(docs) * simulated_overlap)
    return random.sample(range(len(docs)), contaminated)
def get_train_overlap(docs_by_task_set, ngrams_path, limit):
    """Find training-set contamination for each (task, set) of eval documents.

    An overlap occurs when any n-gram (typically 13-grams; see
    scripts/clean_training_data) found in a task document also exists in the
    training-set ngram files ("ngrams_{x}.bkt.txt.sorted.zst" plus info.json
    under `ngrams_path`).

    Algorithm:
    1. Build lookups for each dataset {ngram: set(doc_ids)}
    2. Merge into an overall lookup {ngram: [(task_name, task_set, doc_ids),]}
    3. Full scan the training-set ngrams against the merged lookup, saving
       matches in `duplicates` {(task_name, task_set): set(doc_ids)}
    4. Strip the task_set from the dictionary keys and return

    Task+set lookups and the resulting overlaps are cached under `data/`.

    :param docs_by_task_set: {(task_name, task_set): [document_text, ...]}
    :param ngrams_path: directory containing info.json and *.sorted.zst files
    :param limit: eval doc limit; only used to key the on-disk caches
    :return: {task_name: set(doc_ids)} of contaminated document ids
    """
    info_dict_path = os.path.join(ngrams_path, "info.json")
    # Fix: close the handle instead of leaking it via json.load(open(...)).
    with open(info_dict_path, "r") as fp:
        info_dict = json.load(fp)
    ngrams_n_size = info_dict["ngram_size"]

    janitor = Janitor()

    # Build lookup for each dataset first in case we use different task combinations later
    print("Building Lookups...")
    start = time.perf_counter()

    def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit):
        # Cache-file naming scheme shared by the dump step at the bottom.
        return f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.overlaps"

    lookups = {}
    duplicates = {}  # {(task_name, task_set): set(doc_ids)}
    sets_to_decontaminate = len(docs_by_task_set.keys())

    for (task_name, task_set), docs in docs_by_task_set.items():
        # Fix: makedirs(exist_ok=True) — the old os.mkdir crashed when the
        # parent "data/" directory did not already exist.
        os.makedirs(f"data/{task_name}", exist_ok=True)

        # Check if we've decontaminated this combination before
        overlaps_dump_path = get_overlaps_dump_path(
            task_name, task_set, ngrams_n_size, limit
        )
        if os.path.exists(overlaps_dump_path):
            with open(overlaps_dump_path, "rb") as fp:
                duplicates[(task_name, task_set)] = pickle.load(fp)
            sets_to_decontaminate -= 1
            continue
        else:
            duplicates[(task_name, task_set)] = set()

        # Build/load the task lookup {ngram: set(documents)}.
        task_set_lookup_path = (
            f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.lookup"
        )
        if os.path.exists(task_set_lookup_path):
            print(f"{task_set_lookup_path} available, loading...")
            with open(task_set_lookup_path, "rb") as fp:
                lookups[(task_name, task_set)] = pickle.load(fp)
        else:
            print(f"{task_set_lookup_path} not available, building...")
            lookup = collections.defaultdict(set)

            for doc_id, document in enumerate(docs):
                ngrams = word_ngrams(janitor.normalize_string(document), ngrams_n_size)
                for ngram in ngrams:
                    lookup[ngram].add(doc_id)

            with open(task_set_lookup_path, "wb") as fp:
                pickle.dump(lookup, fp)
            lookups[(task_name, task_set)] = lookup

    elapsed = time.perf_counter() - start
    print(f"Building lookups took {elapsed:0.5f} seconds.")

    matched_ngrams = []

    if sets_to_decontaminate > 0:
        print("Merging lookups...")
        start = time.perf_counter()
        merged_lookup = collections.defaultdict(list)
        for (task_name, task_set), lookup in lookups.items():
            for ngram, doc_ids in lookup.items():
                merged_lookup[ngram].append((task_name, task_set, doc_ids))

        elapsed = time.perf_counter() - start
        print(f"Merging lookups took {elapsed:0.5f} seconds.")

        print(f"{ngrams_n_size} grams files found in {ngrams_path}:")
        files = glob.glob(os.path.join(ngrams_path, "*.sorted.zst"))
        print(files)

        for file in files:
            start = time.perf_counter()
            print(f"Scanning {file}")
            reader = ZStdTextReader(file)
            total_ngrams = 0
            unique_ngrams = 0
            matching_unique = 0
            non_matching_unique = 0

            current_ngram = ""
            for line in reader.read_tqdm():  # Scan training set ngrams file
                total_ngrams += 1
                [ngram, document_id] = line.rsplit(" ", 1)
                if (
                    ngram != current_ngram
                ):  # Only need to match the ngram once in training set
                    unique_ngrams += 1
                    current_ngram = ngram
                    if ngram in merged_lookup:
                        matched_ngrams.append(ngram)  # For logging
                        matching_unique += 1
                        for task_name, task_set, doc_ids in merged_lookup[ngram]:
                            task_doc_set = duplicates[(task_name, task_set)]
                            for (
                                doc_id
                            ) in (
                                doc_ids
                            ):  # Record contamination across all relevant task/set combos
                                task_doc_set.add(doc_id)
                        del merged_lookup[ngram]  # No point matching again
                    else:
                        non_matching_unique += 1

            print(f"Total Ngrams: {total_ngrams}")
            print(f"Unique Ngrams: {unique_ngrams}")
            print(f"Unique Matching: {matching_unique}")
            print(f"Unique Non Matching: {non_matching_unique}")
            print("Matched ngrams:")
            for ngram in matched_ngrams:
                print(ngram)

            elapsed = time.perf_counter() - start
            print(f"Read took {elapsed:0.5f} seconds.")
            print(f"Speed: {(os.path.getsize(file)/1000000.0)/elapsed}MB/second")

    print(duplicates)

    # Dump overlaps separately
    for (task_name, task_set), doc_ids in duplicates.items():
        overlaps_dump_path = get_overlaps_dump_path(
            task_name, task_set, ngrams_n_size, limit
        )
        # Fix: close the handle instead of leaking it via pickle.dump(..., open(...)).
        with open(overlaps_dump_path, "wb") as fp:
            pickle.dump(doc_ids, fp)

    # Strip task set and return
    return {task_name: doc_ids for (task_name, task_set), doc_ids in duplicates.items()}
import re
import string
import timeit
import pickle
import traceback
from pprint import pprint

# This is a cpp module. Compile janitor_util.cpp with:
# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
try:
    import janitor_util

    JANITOR_CPP = True
except Exception:
    print("WARNING: C++ module could not be loaded. Janitor running in python mode")
    traceback.print_exc()
    JANITOR_CPP = False


# Implementation from nltk source
# https://www.nltk.org/_modules/nltk/util.html
def form_ngrams(sequence, n):
    """Yield successive n-tuples from the *iterator* `sequence`."""
    history = []
    while n > 1:
        # PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
        try:
            next_item = next(sequence)
        except StopIteration:
            # no more data, terminate the generator
            return
        history.append(next_item)
        n -= 1
    for item in sequence:
        history.append(item)
        yield tuple(history)
        del history[0]


def word_ngrams(s, n):
    """Splits a string into ngram words"""
    tokens = s.split()  # not a generator :(
    ngram_seqs = form_ngrams(iter(tokens), n)
    return (" ".join(ngram) for ngram in ngram_seqs)


# https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
def split_indices(s):
    """Splits a string on whitespaces and records the indices of each in the original string.
    @:return generator((word, (start_idx, end_idx)), ...)
    """
    return ((m.group(0), (m.start(), m.end() - 1)) for m in re.finditer(r"\S+", s))


def word_ngrams_indices(s, n):
    """Splits a string into pairs of (ngram words, their start/end indices)"""
    tokens_with_indices = split_indices(s)

    # Generator of ngrams of (word, idx_pairs)
    ngram_seqs_with_indices = form_ngrams(tokens_with_indices, n)

    # Generator of pairs of word and index ngrams
    ngram_indices_pairs = (
        zip(*ngram_with_indices) for ngram_with_indices in ngram_seqs_with_indices
    )

    # Generator of ( (word_ngram, (start, end)), (word_ngram, start, end)), ...)
    return (
        (" ".join(ngram_seq), (indices[0][0], indices[-1][1]))
        for ngram_seq, indices in ngram_indices_pairs
    )


class Janitor:
    """Registers contamination n-grams and removes them from training text."""

    # FIXME delete_chars: Should anything else go here? Special chars?
    def __init__(
        self,
        ngram_n=13,
        window_to_remove=200,
        too_dirty_cutoff=10,
        minimum_slice_length=200,
        delete_chars=string.punctuation,
    ):
        self.ngram_n = ngram_n
        self.window_to_remove = window_to_remove
        self.too_dirty_cutoff = too_dirty_cutoff
        self.minimum_slice_length = minimum_slice_length
        self.delete_chars = delete_chars

        self.dirt_ngrams = set()

        # If in python, we'll translate uppercase to lowercase and delete naughty characters.
        # This is fast by python standards
        # https://stackoverflow.com/questions/638893/what-is-the-most-efficient-way-in-python-to-convert-a-string-to-all-lowercase-st
        self.translation_table = str.maketrans(
            string.ascii_lowercase + string.ascii_uppercase,  # These characters
            string.ascii_lowercase * 2,  # Become these characters
            self.delete_chars,  # These are deleted
        )

    ##############
    # I/O for saving contamination ngrams
    ##############

    def save_contamination_ngrams(self, filename):
        """Persist the registered contamination ngrams to `filename`.

        Fix: this previously pickled the *filename* string instead of the
        ngram set, so load_contamination_ngrams could never round-trip.
        """
        with open(filename, "wb") as fp:
            pickle.dump(self.dirt_ngrams, fp)

    def load_contamination_ngrams(self, filename):
        with open(filename, "rb") as fp:
            self.dirt_ngrams = pickle.load(fp)

    ##############
    # Call these :)
    ##############

    def register_contaminant(self, dirt_string):
        """Register a string as contamination to be removed, e.g. a test set
        This breaks the dirt_string into ngrams to store for future cleaning"""
        if JANITOR_CPP:
            return self.register_contaminant_cpp(dirt_string)
        else:
            print("WARNING: Janitor running in python mode")
            return self.register_contaminant_python(dirt_string)

    def clean(self, dirty_string):
        """Clean a string (e.g. a training set) by removing all ngrams previously
        registered as contaminants. Returns a list of clean chunks, or empty if
        the string was too dirty"""
        if JANITOR_CPP:
            return self.clean_cpp(dirty_string)
        else:
            print("WARNING: Janitor running in python mode")
            return self.clean_python(dirty_string)

    def _split_chunks(self, dirty_string, dirty_parts):
        """Cut `dirty_string` around each dirty span (plus a removal window);
        keep only slices longer than `minimum_slice_length`, and give up
        entirely after `too_dirty_cutoff` dirty spans."""
        clean_chunks = []
        splice_idx = 0
        end = -1
        for i, (ngram, start, end) in enumerate(dirty_parts):
            if i >= self.too_dirty_cutoff:
                return []
            start = max(0, start - self.window_to_remove)
            end = min(len(dirty_string), end + self.window_to_remove)

            if start - splice_idx > self.minimum_slice_length:
                clean_chunks.append(dirty_string[splice_idx:start])
            splice_idx = end

        if end < len(dirty_string) - self.minimum_slice_length:
            clean_chunks.append(dirty_string[end + 1 :])

        return clean_chunks

    ##############
    # Fast C++
    ##############

    def register_contaminant_cpp(self, dirt_string):
        self.dirt_ngrams.update(
            janitor_util.clean_ngram(dirt_string, self.delete_chars, self.ngram_n)
        )

    def clean_cpp(self, dirty_string):
        contamination_indices = janitor_util.clean_ngram_with_indices(
            dirty_string, self.delete_chars, self.ngram_n
        )
        return self._split_chunks(dirty_string, contamination_indices)

    ##############
    # Slow python
    ##############

    def normalize_string(self, s):
        # Lowercase and strip `delete_chars` in a single C-level pass.
        return s.translate(self.translation_table)

    def register_contaminant_python(self, dirt_string):
        self.dirt_ngrams.update(
            word_ngrams(self.normalize_string(dirt_string), self.ngram_n)
        )

    def clean_python(self, dirty_string):
        contamination_indices = (
            (None, *idx_pair)
            for dirty_ngram, idx_pair in word_ngrams_indices(dirty_string, self.ngram_n)
            if self.normalize_string(dirty_ngram) in self.dirt_ngrams
        )
        return self._split_chunks(dirty_string, contamination_indices)
This article, many professionals assert, is just one example of +# the media's application of mental disease labels to what is actually variant normal behavior +# &mdash;they argue that shyness, lack of athletic ability or social skills, and intellectual +# interests, even when they seem unusual to others, are not in themselves signs of autism or +# Asperger's syndrome. Others assert that it is actually the medical profession which is applying +# mental disease labels to children who in the past would have simply been accepted as a little +# different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue. +# Due to the recent publicity surrounding autism and autis +# ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first, +# oil money had a marginal impact. A few lowrise concete buildings were erected, and the first +# paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties +# would last, took a cautious approach, preferring to save the revenue rather than investing it in +# development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential +# to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his +# brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]], +# with the assistance of the British, Sheikh Zayed became the new ruler. See generally, Al-Fahim, M, +# ''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995), +# ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the +# Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the +# [[United Arab Emirates]]. 
After the Emirates gained independence in 1971, +# ''') +# """ + +# n = 1 +# print(f"Timing {n} run on 100 MB") +# print("Register contaminant") +# # print("\tPython", timeit.timeit("jan.register_contaminant_python(data)", setup=setup, globals=globals(), number=n)) +# print("\tCpp", timeit.timeit("jan.register_contaminant(data)", setup=setup, globals=globals(), number=n)) + +# print("Clean") +# # print("\tPython", timeit.timeit("jan.clean_python(data)", setup=setup, globals=globals(), number=n)) +# print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n)) + + +# def test_janitor_general(): +# source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2 +# contaminant = "dirty boy. Clean he he" + +# jan = Janitor(ngram_n=3) +# jan.register_contaminant(contaminant) +# cleaned = " ".join(jan.clean(source)) +# for contam in jan.dirt_ngrams: +# assert contam not in cleaned, contam + +# filename = "data/saved_contam" +# jan.save_contamination_ngrams(filename) + +# jan = Janitor(ngram_n=3) +# jan.load_contamination_ngrams(filename) +# cleaned = " ".join(jan.clean(source)) +# for contam in jan.dirt_ngrams: +# assert contam not in cleaned, contam + + +# if __name__ == "__main__": +# test() +# # print_cpp() +# # test_cpp() +# # benchmark() diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..63f92c46008d3f587e9970984a8c79974115aeed --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__init__.py @@ -0,0 +1,20 @@ +from . import gpt2 +from . import gpt3 +from . import huggingface +from . import textsynth +from . 
from . import gpt2
from . import gpt3
from . import huggingface
from . import textsynth
from . import dummy

# Maps the user-facing model names onto their LM implementations.
MODEL_REGISTRY = {
    "hf": gpt2.HFLM,
    "hf-causal": gpt2.HFLM,
    "hf-causal-experimental": huggingface.AutoCausalLM,
    "hf-seq2seq": huggingface.AutoSeq2SeqLM,
    "gpt2": gpt2.GPT2LM,
    "gpt3": gpt3.GPT3LM,
    "textsynth": textsynth.TextSynthLM,
    "dummy": dummy.DummyLM,
}


def get_model(model_name):
    """Look up an LM class by its registry name.

    Still raises KeyError for unknown names (as before), but now with a
    message listing the valid options instead of just the bad key.
    """
    try:
        return MODEL_REGISTRY[model_name]
    except KeyError:
        raise KeyError(
            f"Unknown model '{model_name}'. Available models: "
            + ", ".join(sorted(MODEL_REGISTRY))
        ) from None
import random
from lm_eval.base import LM


class DummyLM(LM):
    """Stub LM returning random log-likelihoods and a fixed completion.

    Lets the evaluation pipeline be exercised without loading a real model.
    """

    def __init__(self):
        pass

    @classmethod
    def create_from_arg_string(cls, arg_string, additional_config=None):
        # Nothing to parse — every DummyLM instance is identical.
        return cls()

    def loglikelihood(self, requests):
        # One (logprob, is_greedy) pair per request; logprob drawn from (-1, 0].
        return [(-random.random(), False) for _ in requests]

    def greedy_until(self, requests):
        completions = []
        for context, _ in requests:
            completions.append("lol")
            assert context.strip() != ""
        return completions

    def loglikelihood_rolling(self, requests):
        # One random negative log-likelihood per request.
        return [-random.random() for _ in requests]
import torch
import transformers
from typing import Optional, Union
from lm_eval.base import BaseLM


class HFLM(BaseLM):
    """BaseLM backed by a HuggingFace `AutoModelForCausalLM` checkpoint."""

    def __init__(
        self,
        device="cuda",
        pretrained="gpt2",
        revision="main",
        low_cpu_mem_usage=None,
        torch_dtype=None,
        device_map=None,
        subfolder=None,
        tokenizer=None,
        batch_size=1,
        load_in_8bit: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
        use_fast: Optional[bool] = True,
    ):
        super().__init__()

        assert isinstance(device, str)
        assert isinstance(pretrained, str)
        assert isinstance(batch_size, int)

        if device:
            if device not in ["cuda", "cpu"]:
                device = int(device)  # e.g. "0" -> CUDA device index 0
            self._device = torch.device(device)
            print(f"Using device '{device}'")
        else:
            print("Device not specified")
            print(f"Cuda Available? {torch.cuda.is_available()}")
            self._device = (
                torch.device("cuda")
                if torch.cuda.is_available()
                else torch.device("cpu")
            )

        # TODO: update this to be less of a hack once subfolder is fixed in HF
        revision = revision + ("/" + subfolder if subfolder is not None else "")

        self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
            pretrained,
            load_in_8bit=load_in_8bit,
            low_cpu_mem_usage=low_cpu_mem_usage,
            torch_dtype=torch_dtype,
            device_map=device_map,
            revision=revision,
            trust_remote_code=trust_remote_code,
        ).eval()
        if not load_in_8bit:
            # Fix: catch Exception instead of a bare `except:` so
            # KeyboardInterrupt / SystemExit still propagate.
            try:
                self.gpt2.to(self.device)
            except Exception:
                print(
                    "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore."
                )
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            pretrained if tokenizer is None else tokenizer,
            revision=revision,
            trust_remote_code=trust_remote_code,
            use_fast=use_fast,
        )
        self.vocab_size = self.tokenizer.vocab_size

        # multithreading and batching
        self.batch_size_per_gpu = batch_size  # todo: adaptive batch size

        # TODO: fix multi-gpu
        # gpus = torch.cuda.device_count()
        # if gpus > 1:
        #     self.gpt2 = nn.DataParallel(self.gpt2)

    @property
    def eot_token_id(self):
        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
        return self.tokenizer.eos_token_id

    @property
    def max_length(self):
        """Model context length; falls back for configs without `n_ctx`."""
        try:
            return self.gpt2.config.n_ctx
        except AttributeError:
            # gptneoconfig doesn't have n_ctx apparently
            return self.gpt2.config.max_position_embeddings

    @property
    def max_gen_toks(self):
        return 256

    @property
    def batch_size(self):
        # TODO: fix multi-gpu
        return self.batch_size_per_gpu  # * gpus

    @property
    def device(self):
        # TODO: fix multi-gpu
        return self._device

    def tok_encode(self, string: str):
        return self.tokenizer.encode(string, add_special_tokens=False)

    def tok_decode(self, tokens):
        return self.tokenizer.decode(tokens)

    def _model_call(self, inps):
        """
        inps: a torch tensor of shape [batch, sequence]
        the size of sequence may vary from call to call

        returns: a torch tensor of shape [batch, sequence, vocab] with the
        logits returned from the model
        """
        with torch.no_grad():
            return self.gpt2(inps)[0]

    def _model_generate(self, context, max_length, eos_token_id):
        """Greedy generation (do_sample=False), padding with the EOS token."""
        return self.gpt2.generate(
            context,
            max_length=max_length,
            eos_token_id=eos_token_id,
            pad_token_id=eos_token_id,
            do_sample=False,
        )


# for backwards compatibility
GPT2LM = HFLM
def get_result(response, ctxlen):
    """Process results from OpenAI API response.

    :param response: dict
        OpenAI API Response
    :param ctxlen: int
        Length of context (so we can slice them away and only keep the predictions)
    :return:
        continuation_logprobs: np.array
            Log probabilities of continuation tokens
        is_greedy: bool
            whether argmax matches given continuation exactly
    """
    logprob_info = response["logprobs"]
    continuation_logprobs = sum(logprob_info["token_logprobs"][ctxlen:])

    # The continuation is "greedy" iff every continuation token was also the
    # highest-scoring candidate the model proposed at that position.
    is_greedy = True
    tokens = logprob_info["tokens"]
    for position in range(ctxlen, len(tokens)):
        candidates = logprob_info["top_logprobs"][position]
        best = max(candidates.keys(), key=lambda tok: candidates[tok])
        if best != tokens[position]:
            is_greedy = False
            break

    return continuation_logprobs, is_greedy


def oa_completion(**kwargs):
    """Query OpenAI API for completion.

    Retry with back-off until they respond
    """
    import openai

    delay = 3
    while True:
        try:
            return openai.Completion.create(**kwargs)
        except openai.error.OpenAIError:
            import traceback

            traceback.print_exc()
            time.sleep(delay)
            delay *= 1.5
davinci) + :param truncate: bool + Truncate input if too long (if False and input is too long, throw error) + """ + super().__init__() + + import openai + + self.engine = engine + self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2") + + self.vocab_size = self.tokenizer.vocab_size + + # to make the annoying "Using pad_token, but it is not set yet." error go away + self.tokenizer.pad_token = "<|endoftext|>" + assert self.tokenizer.encode("hello\n\nhello") == [31373, 198, 198, 31373] + self.truncate = truncate + self.end_of_text_token_id = self.tokenizer.convert_tokens_to_ids( + ["<|endoftext|>"] + )[0] + + # Read from environment variable OPENAI_API_SECRET_KEY + openai.api_key = os.environ["OPENAI_API_SECRET_KEY"] + + @property + def eot_token_id(self): + return self.tokenizer.eos_token_id + + @property + def max_length(self): + # Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token + return 2048 + + @property + def max_gen_toks(self): + return 256 + + @property + def batch_size(self): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + @property + def device(self): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + def tok_encode(self, string: str): + return self.tokenizer.encode(string, add_special_tokens=False) + + def tok_decode(self, tokens): + return self.tokenizer.decode(tokens) + + def _loglikelihood_tokens(self, requests, disable_tqdm=False): + res = [] + + def _collate(x): + # this doesn't efficiently handle last-token differences yet, but those are kinda annoying because + # it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations + # we care about and so we need some kind of backup for when it isn't + toks = x[1] + x[2] + return -len(toks), tuple(toks) + + re_ord = utils.Reorderer(requests, _collate) + + for chunk in tqdm( + list(utils.chunks(re_ord.get_reordered(), 
self.REQ_CHUNK_SIZE)), + disable=disable_tqdm, + ): + inps = [] + ctxlens = [] + for cache_key, context_enc, continuation_enc in chunk: + # max_length+1 because the API takes up to 2049 tokens, including the first context token + inp = (context_enc + continuation_enc)[-(self.max_length + 1) :] + # TODO: the logic is much simpler if we just look at the length of continuation tokens + ctxlen = len(context_enc) - max( + 0, len(context_enc) + len(continuation_enc) - (self.max_length + 1) + ) + + inps.append(inp) + ctxlens.append(ctxlen) + + response = oa_completion( + engine=self.engine, + prompt=inps, + echo=True, + max_tokens=0, + temperature=0.0, + logprobs=10, + ) + + for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip( + response.choices, ctxlens, chunk + ): + answer = get_result(resp, ctxlen) + + res.append(answer) + + # partial caching + if cache_key is not None: + self.cache_hook.add_partial("loglikelihood", cache_key, answer) + + return re_ord.get_original(res) + + def greedy_until(self, requests): + if not requests: + return [] + res = [] + + def _collate(x): + toks = self.tok_encode(x[0]) + return len(toks), x[0] + + re_ord = utils.Reorderer(requests, _collate) + + def sameuntil_chunks(xs, size): + ret = [] + lastuntil = xs[0][1] + for x in xs: + if len(ret) >= size or x[1] != lastuntil: + yield ret, lastuntil + ret = [] + lastuntil = x[1] + ret.append(x) + + if ret: + yield ret, lastuntil + + # todo: more intelligent batching for heterogeneous `until` + for chunk, until in tqdm( + list(sameuntil_chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE)) + ): + inps = [] + for request in chunk: + context = request[0] + context_enc = self.tok_encode(context) + inp = context_enc[-(self.max_length - self.max_gen_toks) :] + inps.append(inp) + + response = oa_completion( + engine=self.engine, + prompt=inps, + max_tokens=self.max_gen_toks, + temperature=0.0, + logprobs=10, + stop=until, + ) + + for resp, request in zip(response.choices, chunk): + context 
= request[0] + until_ = request[1] + s = resp["text"] + + for term in until_: + s = s.split(term)[0] + + # partial caching + self.cache_hook.add_partial("greedy_until", (context, until_), s) + + res.append(s) + + return re_ord.get_original(res) + + def _model_call(self, inps): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + def _model_generate(self, context, max_length, eos_token_id): + # Isn't used because we override greedy_until + raise NotImplementedError() diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/models/huggingface.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/models/huggingface.py new file mode 100644 index 0000000000000000000000000000000000000000..caa91a7a00dd324988728f74cb1b6cfe54087723 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/models/huggingface.py @@ -0,0 +1,740 @@ +import math +import torch +import torch.nn.functional as F +import transformers +import peft +from pathlib import Path +from typing import List, Mapping, NewType, Optional, Tuple, Union +from tqdm import tqdm + +from transformers import BatchEncoding + +from lm_eval import utils +from lm_eval.base import BaseLM + +TokenSequence = Union[List[int], torch.LongTensor, torch.Tensor, BatchEncoding] + +_DeviceMapping = NewType("DeviceMapping", Mapping[str, Union[int, str, torch.device]]) + + +def _get_accelerate_args( + device_map_option: Optional[str] = "auto", + max_memory_per_gpu: Optional[Union[int, str]] = None, + max_cpu_memory: Optional[Union[int, str]] = None, + offload_folder: Optional[str] = "./offload", +) -> dict: + """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`.""" + max_memory = {} + if max_memory_per_gpu is not None: + max_memory_per_gpu_map = { + device_idx: max_memory_per_gpu + for device_idx in range(torch.cuda.device_count()) + } + max_memory.update(max_memory_per_gpu_map) + if max_cpu_memory is not None: + max_memory["cpu"] = max_cpu_memory + + args = 
{} + if max_memory: + args["max_memory"] = max_memory + args["device_map"] = device_map_option + args["offload_folder"] = offload_folder + return args + + +def _get_dtype( + dtype: Union[str, torch.dtype], config: Optional[transformers.AutoConfig] = None +) -> torch.dtype: + """Converts `dtype` from `str` to torch.dtype when possible.""" + if dtype is None and config is not None: + _torch_dtype = config.torch_dtype + elif isinstance(dtype, str) and dtype != "auto": + # Convert `str` args torch dtype: `float16` -> `torch.float16` + _torch_dtype = getattr(torch, dtype) + else: + _torch_dtype = dtype + return _torch_dtype + + +class HuggingFaceAutoLM(BaseLM): + AUTO_CONFIG_CLASS: transformers.AutoConfig = transformers.AutoConfig + AUTO_TOKENIZER_CLASS: transformers.AutoTokenizer = transformers.AutoTokenizer + AUTO_MODEL_CLASS: transformers.AutoModel = None + AUTO_PEFT_CLASS: peft.PeftModel = None + + # Default max sequence length setting for when no `max_length` is provided + # or no max length config setting is found in the model or tokenizer. 
+ _DEFAULT_MAX_LENGTH: int = 2048 + + def __init__( + self, + pretrained: str, + quantized: Optional[Union[bool, str]] = None, + tokenizer: Optional[str] = None, + subfolder: Optional[str] = None, + revision: Optional[str] = "main", + batch_size: Optional[int] = 1, + max_gen_toks: Optional[int] = 256, + max_length: Optional[int] = None, + add_special_tokens: Optional[bool] = None, + use_accelerate: Optional[bool] = False, + device_map_option: Optional[str] = "auto", + max_memory_per_gpu: Optional[Union[int, str]] = None, + max_cpu_memory: Optional[Union[int, str]] = None, + offload_folder: Optional[str] = "./offload", + dtype: Optional[Union[str, torch.dtype]] = None, + device: Optional[Union[int, str]] = "cuda", + peft: str = None, + load_in_8bit: Optional[bool] = False, + trust_remote_code: Optional[bool] = False, + use_fast: Optional[bool] = True, + gptq_use_triton: Optional[bool] = False, + ): + """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation. + Args: + pretrained (str): + The HuggingFace Hub model ID name or the path to a pre-trained + model to load. This is effectively the `pretrained_model_name_or_path` + argument of `from_pretrained` in the HuggingFace `transformers` API. + quantized (str or True, optional, defaults to None): + File name of a GPTQ quantized model to load. Set to `True` to use the + default name of the quantized model. + add_special_tokens (bool, optional, defaults to True): + Whether to add special tokens to the input sequences. If `None`, the + default value will be set to `True` for seq2seq models (e.g. T5) and + `False` for causal models. + WARNING: Evaluating causal models with `add_special_tokens=True` is + currently __not__ supported. + > Large model loading `accelerate` arguments + use_accelerate (bool, optional, defaults to False): + If True, uses the `accelerate` library to load a large model across + multiple devices. 
+ device_map_option (str, optional, defaults to "auto"): + The device map option to use when loading the model with + `accelerate`. + Options: + "auto", "balanced", "balanced_low_0", "sequential" + See the `accelerate` docs for more details on these options: + https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.device_map + max_memory_per_gpu (Union[int, str], optional, defaults to None): + The maximum memory available for each GPU in bytes as `int` or in + the format f"{significand}{unit_symbol}" where {unit_symbol} is + any of ["GB", "MB", "GIB", "MIB"]. Refer to the `max_memory` arg in + the "Parameters for big model inference" section of the following + docs: + https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.max_memory + max_cpu_memory (Union[int, str], optional, defaults to None): + The maximum available CPU RAM in bytes as `int` or in the format + f"{significand}{unit_symbol}" where {unit_symbol} is any of + ["GB", "MB", "GIB", "MIB"]. Refer to the `max_memory` arg in the + "Parameters for big model inference" section of the following docs: + https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.max_memory + offload_folder (str, optional, defaults to "./offload"): + The folder to offload weights into if `device_map` contains any + "disk" value. + dtype (Union[str, torch.dtype], optional, defaults to None):): + Converts the model weights to `dtype`, if specified. Strings get + converted to `torch.dtype` objects (e.g. `float16` -> `torch.float16`). + Use `dtype="auto"` to derive the type from the model’s weights. + peft (str, optional, defaults to None): + Path of the adapter weights to load from Huggingface. This will usually + include a directory that includes the files `adapter_config.json` and + `adapter_model.bin`. 
Compatible with [PEFT](https://github.com/huggingface/peft) + load_in_8bit (bool, optional, defaults to False): + If True, will convert the loaded model into mixed-8bit quantized model. See: + https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.load_in_8bit + trust_remote_code (bool, optional, defaults to False): + If True, will trust the remote code when loading the model. + use_fast (bool, optional, defaults to True): + If True, will use the fast tokenizer when loading the model. + gptq_use_triton (bool, optional, defaults to False): + Use Triton for GPTQ inference. + """ + super().__init__() + + assert isinstance(pretrained, str) + assert isinstance(device, str) + assert isinstance(batch_size, int) + if ( + add_special_tokens is not None + and self.AUTO_MODEL_CLASS is transformers.AutoModelForCausalLM + ): + # TODO: Support evaluating causal models with special tokens. Currently, + # this is not possible because the `_loglikelihood_tokens()` method for + # causal LMs makes a no-special-tokens assumption given that contexts + # and labels/continuations are tokenized separately without special + # tokens, concatenated, and then processed as inputs. + assert ( + not add_special_tokens + ), "Evaluating causal models with `add_special_tokens=True` is currently not supported." 
+ + self._batch_size = batch_size # TODO: Adaptive batch size + self._max_gen_toks = max_gen_toks + self._max_length = max_length + self._config = self.AUTO_CONFIG_CLASS.from_pretrained( + pretrained, + trust_remote_code=trust_remote_code, + revision=revision + ("/" + subfolder if subfolder is not None else ""), + ) + + self._add_special_tokens = add_special_tokens + self.tokenizer = self._create_auto_tokenizer( + pretrained=pretrained, + revision=revision, + subfolder=subfolder, + tokenizer=tokenizer, + use_fast=use_fast, + ) + self.tokenizer.model_max_length = self.max_length + + model_kwargs = {} + if use_accelerate: + model_kwargs = _get_accelerate_args( + device_map_option, + max_memory_per_gpu, + max_cpu_memory, + offload_folder, + ) + model_kwargs["load_in_8bit"] = load_in_8bit + self.model = self._create_auto_model( + pretrained=pretrained, + quantized=quantized, + trust_remote_code=trust_remote_code, + revision=revision, + subfolder=subfolder, + torch_dtype=_get_dtype(dtype, self._config), + gptq_use_triton=gptq_use_triton, + **model_kwargs, + ) + # note: peft_path can be different than pretrained model path + if peft is not None: + self.model = self._create_auto_model_peft( + model=self.model, + peft=peft, + revision=revision, + subfolder=subfolder, + torch_dtype=_get_dtype(dtype, self._config), + **model_kwargs, + ) + self.model.eval() + torch.set_grad_enabled(False) + + self._device = device + if use_accelerate and "lm_head" in self.model.hf_device_map: + # `accelerate` can place `lm_head` weights on a different device than + # the user specified one so we force `self._device` to be the same as + # `lm_head`'s. + self._device = self.model.hf_device_map["lm_head"] + if not use_accelerate and not load_in_8bit: + try: + self.model.to(self._device) + except: # noqa: E722 + print( + "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore." 
+ ) + + def _create_auto_model( + self, + *, + pretrained: str, + quantized: Optional[Union[bool, str]] = None, + revision: str, + subfolder: str, + device_map: Optional[Union[str, _DeviceMapping]] = None, + max_memory: Optional[dict] = None, + offload_folder: Optional[str] = None, + load_in_8bit: Optional[bool] = False, + trust_remote_code: Optional[bool] = False, + torch_dtype: Optional[Union[str, torch.dtype]] = None, + gptq_use_triton: Optional[bool] = False, + ) -> transformers.AutoModel: + """Returns a pre-trained pytorch model from a pre-trained model configuration.""" + if quantized is None: + model = self.AUTO_MODEL_CLASS.from_pretrained( + pretrained, + revision=revision + ("/" + subfolder if subfolder is not None else ""), + device_map=device_map, + max_memory=max_memory, + offload_folder=offload_folder, + load_in_8bit=load_in_8bit, + trust_remote_code=trust_remote_code, + torch_dtype=torch_dtype, + ) + else: + from auto_gptq import AutoGPTQForCausalLM + + model = AutoGPTQForCausalLM.from_quantized( + pretrained, + model_basename=None if quantized is True else Path(quantized).stem, + device_map=device_map, + max_memory=max_memory, + trust_remote_code=trust_remote_code, + use_safetensors=True + if quantized is True + else quantized.endswith(".safetensors"), + use_triton=gptq_use_triton, + warmup_triton=gptq_use_triton, + ) + return model + + def _create_auto_model_peft( + self, + *, + model: transformers.PreTrainedModel, + peft: str, + revision: str, + subfolder: str, + device_map: Optional[Union[str, _DeviceMapping]] = None, + max_memory: Optional[dict] = None, + offload_folder: Optional[str] = None, + load_in_8bit: Optional[bool] = False, + trust_remote_code: Optional[bool] = False, + torch_dtype: Optional[Union[str, torch.dtype]] = None, + ): + model = self.AUTO_PEFT_CLASS.from_pretrained( + model, + peft, + revision=revision + ("/" + subfolder if subfolder is not None else ""), + device_map=device_map, + max_memory=max_memory, + 
offload_folder=offload_folder, + load_in_8bit=load_in_8bit, + trust_remote_code=trust_remote_code, + torch_dtype=torch_dtype, + ) + return model + + def _create_auto_tokenizer( + self, + *, + pretrained: str, + revision: str, + subfolder: str, + tokenizer: Optional[str] = None, + use_fast: Optional[bool] = True, + ) -> transformers.PreTrainedTokenizer: + """Returns a pre-trained tokenizer from a pre-trained tokenizer configuration.""" + tokenizer = self.AUTO_TOKENIZER_CLASS.from_pretrained( + pretrained if tokenizer is None else tokenizer, + revision=revision + ("/" + subfolder if subfolder is not None else ""), + use_fast=use_fast, + ) + tokenizer.pad_token = tokenizer.eos_token + return tokenizer + + @property + def add_special_tokens(self) -> bool: + """Whether to include special tokens in encoded text. This should be + determined by whether or not the model was trained with special tokens. + TODO: Remove these conditionals once HuggingFace supports a way to + check whether or not an arbitrary model was trained with special tokens. + """ + if self._add_special_tokens is not None: + return self._add_special_tokens + elif self.AUTO_MODEL_CLASS is transformers.AutoModelForCausalLM: + return False + elif self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM: + return True + else: + raise ValueError( + "Could not determine `add_special_tokens` value from the model " + "class. Set to `True` or `False` depending on whether the model " + "was pre-trained with special tokens." + ) + + @property + def eot_token(self) -> str: + return self.tokenizer.eos_token + + @property + def eot_token_id(self) -> int: + return self.tokenizer.eos_token_id + + @property + def max_gen_toks(self) -> int: + return self._max_gen_toks + + @property + def max_length(self) -> int: + """Return the maximum sequence length of the model. + NOTE: Different model configurations have different max sequence length + attribute names. 
+ - n_positions: (CTRLConfig) + - max_position_embeddings: (BartConfig, RoFormerConfig) + - n_ctx: (GPT2Config) + NOTE: For relative position encoded models you should specify the max + sequence length of the model in the constructor via `max_length`. + """ + if self._max_length is not None: + return self._max_length + # Try to get the sequence length from the model config. + seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx") + for attr in seqlen_config_attrs: + if hasattr(self._config, attr): + return getattr(self._config, attr) + if hasattr(self.tokenizer, "model_max_length"): + return self.tokenizer.model_max_length + return self._DEFAULT_MAX_LENGTH + + @property + def batch_size(self) -> int: + # TODO: Add adaptive batch size. + return self._batch_size # * gpus + + @property + def device(self) -> Union[int, str, torch.device]: + return self._device + + def tok_encode(self, string: str) -> TokenSequence: + # TODO: Merge `tok_encode_batch` here. + return self.tokenizer.encode(string, add_special_tokens=self.add_special_tokens) + + def tok_encode_batch(self, strings: List[str]) -> TokenSequence: + return self.tokenizer( + strings, + padding=True, + add_special_tokens=self.add_special_tokens, + return_tensors="pt", + ) + + def tok_decode(self, tokens: torch.LongTensor) -> List[str]: + return self.tokenizer.batch_decode(tokens, skip_special_tokens=True) + + def greedy_until( + self, requests: List[Tuple[str, Union[List[str], str]]] + ) -> List[str]: + def _collate(x): + tokens = self.tok_encode(x[0]) + return len(tokens), x[0] + + results = [] + reorder = utils.Reorderer(requests, _collate) + for chunk in utils.chunks( + tqdm(reorder.get_reordered(), disable=False), self.batch_size + ): + context = [c[0] for c in chunk] + request_args = chunk[0][1] + stop_sequences = ( + request_args if isinstance(request_args, list) else [request_args] + ) # request_args["stop_sequences"] + max_generation_length = ( + self._max_gen_toks + ) # 
request_args["max_generation_length"] + + assert ( + isinstance(max_generation_length, int) or max_generation_length is None + ) + assert isinstance(stop_sequences, list) or stop_sequences is None + + # TODO: Find a better way to handle stop sequences for 0-shot. + if stop_sequences is None: + until = [self.eot_token] + else: + until = stop_sequences + [self.eot_token] + + if max_generation_length is None: + max_tokens = self.max_gen_toks + else: + max_tokens = max_generation_length + + token_context = self.tok_encode_batch(context) + + responses = self._model_generate( + inputs=token_context, + max_tokens=max_tokens, + stop=until, + ) + responses = self.tok_decode(responses.tolist()) + + for response in responses: + # Ensure the generated responses do not contain the stop sequences. + for term in until: + response = response.split(term)[0] + # partial caching + self.cache_hook.add_partial("greedy_until", (context, until), response) + results.append(response) + return reorder.get_original(results) + + +class AutoCausalLM(HuggingFaceAutoLM): + """Causal language modeling. 
+ You can find a set of supported models in the HF documentation: + https://huggingface.co/docs/transformers/main/model_doc/auto#transformers.AutoModelForCausalLM + """ + + AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM + AUTO_PEFT_CLASS = peft.PeftModel + + def _create_auto_tokenizer( + self, + *, + pretrained: str, + revision: str, + subfolder: str, + tokenizer: Optional[str] = None, + use_fast: Optional[bool] = True, + ) -> transformers.PreTrainedTokenizer: + tokenizer = super()._create_auto_tokenizer( + pretrained=pretrained, + revision=revision, + subfolder=subfolder, + tokenizer=tokenizer, + use_fast=use_fast, + ) + tokenizer.padding_side = "left" + return tokenizer + + def _model_call( + self, inputs: TokenSequence, labels: Optional[TokenSequence] = None + ) -> TokenSequence: + return self.model(inputs)["logits"] + + def _model_generate( + self, + inputs: transformers.BatchEncoding, + max_tokens: int, + stop: Optional[List[str]] = None, + ) -> TokenSequence: + # Ensure that the context does not encroach into the `space` + # for the generation. + input_ids = inputs["input_ids"][:, self.max_gen_toks - self.max_length :] + attention_mask = inputs["attention_mask"][ + :, self.max_gen_toks - self.max_length : + ] + input_ids = input_ids.to(self.device) + attention_mask = attention_mask.to(self.device) + + stopping_criteria = stop_sequences_criteria( + self.tokenizer, stop, input_ids.shape[1], input_ids.shape[0] + ) + + generations = self.model.generate( + input_ids=input_ids, + attention_mask=attention_mask, + # GPT style models require the `generate` `max_length` arg to include the + # context length, so we instead set `max_new_tokens` which is the number + # of new tokens to generate, excluding the current number of tokens. 
+ max_new_tokens=max_tokens, + stopping_criteria=stopping_criteria, + do_sample=False, + ) + return utils.select_continuation_from_batch_left_padding( + generations, max_context_size=inputs["input_ids"].size(1) + ) + + +class AutoSeq2SeqLM(HuggingFaceAutoLM): + """Seq2Seq language modeling. + You can find a set of supported models in the following documentation: + https://huggingface.co/docs/transformers/main/model_doc/auto#transformers.AutoModelForSeq2SeqLM + """ + + AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM + AUTO_PEFT_CLASS = peft.PeftModel + + @property + def max_length(self) -> int: + """Return the maximum sequence length of the model. + TODO: Currently only works for relative position encoded Seq2Seq models. + """ + if self._max_length is not None: + return self._max_length + return self._DEFAULT_MAX_LENGTH + + def loglikelihood( + self, requests: List[Tuple[str, str]] + ) -> List[Tuple[float, bool]]: + new_requests = [] + for chunk in utils.chunks(requests, self.batch_size): + context, continuation = zip(*chunk) + + # Fill empty contexts with the EOT token. + context = [ + f"{self.eot_token}" if len(text) == 0 else text for text in context + ] + context_enc = self.tok_encode_batch(context) + for key in context_enc: + context_enc[key] = context_enc[key][:, -self.max_length :] + + # Remove leading whitespace introduced by the default + # `text_target_separator` since the context and continuation + # will not be concatenated as a single (decoder) input. 
+ continuation = [text.lstrip() for text in continuation] + continuation_enc = self.tok_encode_batch(list(continuation)) + for key in continuation_enc: + continuation_enc[key] = continuation_enc[key][:, -self.max_length :] + + new_requests.append( + ((context, continuation), context_enc, continuation_enc) + ) + return self._loglikelihood_tokens(new_requests) + + def loglikelihood_rolling(self, requests: List[Tuple[str, str]]) -> List[float]: + loglikelihoods = [] + for (string,) in tqdm(requests): + rolling_token_windows = list( + map( + utils.make_disjoint_window, + utils.get_rolling_token_windows( + token_list=self.tok_encode(string), + prefix_token=self.eot_token_id, + max_seq_len=self.max_length, + context_len=1, + ), + ) + ) + contexts, conts = utils.split_and_pad_windows( + rolling_token_windows, + pad_token_id=self.eot_token_id, + max_seq_len=self.max_length, + ) + # Manually create BatchEncoding tensors with attention masks as + # expected by `self._model_call` in `self._loglikelihood_tokens`. + contexts_enc = torch.Tensor(contexts).long() + contexts_enc = transformers.tokenization_utils_base.BatchEncoding( + { + "input_ids": contexts_enc, + "attention_mask": (contexts_enc != self.eot_token_id).long(), + } + ) + conts_enc = torch.Tensor(conts).long() + conts_enc = transformers.tokenization_utils_base.BatchEncoding( + { + "input_ids": conts_enc, + "attention_mask": (conts_enc != self.eot_token_id).long(), + } + ) + # TODO: Extract out this call so it only gets called once and also + # somehow figure out partial caching for. 
+ rolling_token_windows_request = [ + ((contexts, conts), contexts_enc, conts_enc) + ] + string_nll = self._loglikelihood_tokens( + rolling_token_windows_request, disable_tqdm=True + ) + string_nll = [x[0] for x in string_nll] # discard is_greedy + string_nll = sum(string_nll) + loglikelihoods.append(string_nll) + return loglikelihoods + + def _loglikelihood_tokens( + self, + requests: List[Tuple[Tuple[str, str], TokenSequence, TokenSequence]], + disable_tqdm: Optional[bool] = False, + ) -> List[Tuple[float, bool]]: + results = [] + for chunk in tqdm( + requests, total=math.ceil(len(requests)), disable=disable_tqdm + ): + cache_keys, inputs_tokens, targets_tokens = chunk + inputs_tokens = inputs_tokens.to(self.device) + targets_tokens = targets_tokens.to(self.device) + outputs = self._model_call(inputs=inputs_tokens, labels=targets_tokens) + log_softmaxes = F.log_softmax(outputs.logits, dim=-1) + + output_iterator = zip( + zip(cache_keys[0], cache_keys[1]), + log_softmaxes, + targets_tokens["input_ids"], + targets_tokens["attention_mask"], + ) + for cache_key, log_softmax, target_tokens, target_mask in output_iterator: + length = target_mask.sum() + log_softmax = log_softmax[:length] + target_tokens = target_tokens[:length] + greedy_tokens = log_softmax.argmax(dim=-1) + max_equal = (greedy_tokens == target_tokens).all() + target_logits = torch.gather( + log_softmax, 1, target_tokens.unsqueeze(-1) + ).squeeze(-1) + answer = (float(target_logits.sum()), bool(max_equal)) + results.append(answer) + if cache_key is not None: + self.cache_hook.add_partial("loglikelihood", cache_key, answer) + return results + + def _model_call( + self, inputs: TokenSequence, labels: Optional[TokenSequence] = None + ) -> TokenSequence: + return self.model(**inputs, labels=labels["input_ids"]) + + def _model_generate( + self, + inputs: transformers.BatchEncoding, + max_tokens: int, + stop: Optional[List[str]] = None, + ) -> TokenSequence: + input_ids = inputs["input_ids"][:, 
-self.max_length :].to(self.device) + attention_mask = inputs["attention_mask"][:, -self.max_length :].to(self.device) + + # Generate one token to calculate the number of start tokens prepended to decoder_input_ids + # (leaving this here in case the below assumption is violated in the future) + # one_tok_gen = self.model.generate( + # input_ids=torch.zeros((1, 1), dtype=torch.int), + # min_length=2, + # max_new_tokens=1, + # ).squeeze() + # initial_decoder_input_length = len(one_tok_gen) - 1 + + # Assume that there will always only be one token in the decoder inputs, assumption holds for existing HF models + stopping_criteria = stop_sequences_criteria( + self.tokenizer, stop, 1, input_ids.shape[0] + ) + + generations = self.model.generate( + input_ids=input_ids, + attention_mask=attention_mask, + max_new_tokens=max_tokens, + stopping_criteria=stopping_criteria, + do_sample=False, + ) + return generations + + +class MultiTokenEOSCriteria(transformers.StoppingCriteria): + """Criteria to stop on the specified multi-token sequence.""" + + def __init__( + self, + sequence: str, + tokenizer: transformers.PreTrainedTokenizer, + initial_decoder_input_length: int, + batch_size: int, + ): + self.initial_decoder_input_length = initial_decoder_input_length + self.done_tracker = [False] * batch_size + self.sequence = sequence + self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False) + self.sequence_id_len = len(self.sequence_ids) + self.tokenizer = tokenizer + + def __call__(self, input_ids, scores, **kwargs) -> bool: + # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence + lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :][ + :, -self.sequence_id_len : + ] + + lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch) + + for i, done in enumerate(self.done_tracker): + if not done: + self.done_tracker[i] = self.sequence in lookback_tokens_batch[i] + return False not in 
self.done_tracker + + +def stop_sequences_criteria( + tokenizer: transformers.PreTrainedTokenizer, + stop_sequences: List[str], + initial_decoder_input_length: int, + batch_size: int, +) -> transformers.StoppingCriteriaList: + return transformers.StoppingCriteriaList( + [ + *[ + MultiTokenEOSCriteria( + sequence, tokenizer, initial_decoder_input_length, batch_size + ) + for sequence in stop_sequences + ], + ] + ) diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/models/textsynth.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/models/textsynth.py new file mode 100644 index 0000000000000000000000000000000000000000..bc80cc9969583ad15928de4074e69a4ba4ea9539 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/models/textsynth.py @@ -0,0 +1,155 @@ +""" TextSynth API +Implementation provided by Fabrice Bellard: + https://github.com/EleutherAI/lm-evaluation-harness/issues/295 + +In order to use the API, you must have a valid TextSynth account and +enough credits. + +Example usage: + + python main.py --model textsynth --model_args engine=gptj_6B --no_cache --tasks piqa + +Homepage: https://textsynth.com/index.html +""" +import logging +import os +import requests as _requests +import time +from tqdm import tqdm +from lm_eval.base import BaseLM + + +logger = logging.getLogger(__name__) + + +def textsynth_completion(**kwargs): + """Query TextSynth API for completion. + Retry with back-off until they respond. + """ + backoff_time = 3 + while True: + try: + return _requests.post(**kwargs) + except _requests.exceptions.RequestException: + import traceback + + traceback.print_exc() + time.sleep(backoff_time) + backoff_time *= 1.5 + + +class TextSynthLM(BaseLM): + def __init__(self, engine, truncate=False): + """ + :param engine: str + TextSynth API engine (e.g. 
`gptj_6B`) + :param truncate: bool + Truncate input if too long (if False and input is too long, throw error) + """ + super().__init__() + + self.engine = engine + self.truncate = truncate + self.api_url = "https://api.textsynth.com" + # Read from environment variable TEXTSYNTH_API_SECRET_KEY + self.api_key = os.environ["TEXTSYNTH_API_SECRET_KEY"] + + @property + def eot_token_id(self): + # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until + raise NotImplementedError() + + @property + def max_length(self): + # NOTE: Turn on truncation to avoid errors on long inputs. + return 2048 + + @property + def max_gen_toks(self): + return 256 + + @property + def batch_size(self): + # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until + raise NotImplementedError() + + @property + def device(self): + # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until + raise NotImplementedError() + + def tok_encode(self, string: str): + # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until + raise NotImplementedError() + + def tok_decode(self, tokens): + # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until + raise NotImplementedError() + + def loglikelihood(self, requests): + res = [] + for context, continuation in tqdm(requests): + response = textsynth_completion( + url=self.api_url + "/v1/engines/" + self.engine + "/logprob", + headers={"Authorization": "Bearer " + self.api_key}, + json={"context": context, "continuation": continuation}, + ) + resp = response.json() + if "logprob" in resp: + logprob = resp["logprob"] + is_greedy = resp["is_greedy"] + res.append((logprob, is_greedy)) + else: + logger.error( + f"The following response does not contain `logprobs`. 
Got:\n{resp}" + ) + assert False + return res + + def loglikelihood_rolling(self, requests): + # TODO: The TextSynth API does not support tokenized inputs so we cannot + # manually partition long contexts into smaller rolling windows as + # done for other models derived from `BaseLM`. Override this method + # with a windowing scheme that works for direct string inputs. + raise NotImplementedError( + "`loglikelihood_rolling` is currently not supported due to lack of " + "input tokenization support from TextSynth." + ) + + def greedy_until(self, requests): + if not requests: + return [] + + res = [] + for request in tqdm(requests): + inp = request[0] + until = request[1] + response = textsynth_completion( + url=self.api_url + "/v1/engines/" + self.engine + "/completions", + headers={"Authorization": "Bearer " + self.api_key}, + json={ + "prompt": inp, + "max_tokens": self.max_gen_toks, + "top_k": 1, + "stop": until, + }, + ) + resp = response.json() + if "text" in resp: + s = resp["text"] + res.append(s) + else: + logger.error( + f"The following response does not contain generated `text`. 
" + "Got:\n{resp}" + ) + assert False + return res + + def _model_call(self, inps): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + def _model_generate(self, context, max_length, eos_token_id): + # Isn't used because we override greedy_until + raise NotImplementedError() diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/__init__.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0c9b16aabee5e2c366aea0d978fbb342a9ee4dc Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/anli.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/anli.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30001b12a59b16ff95c268a37aef518d228aa8a2 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/anli.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/arc.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/arc.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9457bf8b297e5410475ff758f8ec30a0f898e419 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/arc.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/arithmetic.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/arithmetic.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b3e68270a96f21b3954109382937d802deedcd9 Binary files /dev/null and 
b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/arithmetic.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/asdiv.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/asdiv.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d06c230d9b118bb8c9ef7dac8af023f547e4fcd7 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/asdiv.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/blimp.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/blimp.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bae86c01f8da82f90bd4ee847c0f37b0bcad305a Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/blimp.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/cbt.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/cbt.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..edc380ab18a8a97d1320a37db8a8803f86a38539 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/cbt.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/coqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/coqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a77764377214f72b90279fea25908e444ff6102f Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/coqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/crowspairs.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/crowspairs.cpython-310.pyc new 
file mode 100644 index 0000000000000000000000000000000000000000..65ad1a119701f3952d853fe5d4c77c6ea2005a65 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/crowspairs.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/drop.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/drop.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d84daf8c2faef6c300209302755ebc4d893290f0 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/drop.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/gsm8k.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/gsm8k.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a560e5b90a3a6ec5b038aa990f960da1b5544780 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/gsm8k.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/headqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/headqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6edfd0eab0f2a33d61c2df220f398e3a0211e5da Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/headqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hellaswag.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hellaswag.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b000ee58e62d3ded646b13259a7e05d7381c51c Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hellaswag.cpython-310.pyc differ diff --git 
a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hendrycks_math.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hendrycks_math.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a6aa89cb0d5a32fa197feb2acd16dd3a8e24a46a Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hendrycks_math.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hendrycks_test.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hendrycks_test.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c27b87d87b348eba6cda243ac23eed9a87efbc1 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hendrycks_test.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1d7136fb0b8b8a918fe578ed1fe0513e87338f5 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada_cloze.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada_cloze.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02215e3807397be1780b82a2c1fff6e7b49b07f8 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada_cloze.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada_multilingual.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada_multilingual.cpython-310.pyc new 
file mode 100644 index 0000000000000000000000000000000000000000..d97a2ceb3a230786e552b61aa72b4995afee9a4c Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada_multilingual.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/logiqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/logiqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f122e1005a95490469ceb9f4076092bfc5f416ab Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/logiqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/mathqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/mathqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..157ca9c1caa5a42a2e1f8adb77497649f429a002 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/mathqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/mc_taco.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/mc_taco.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb308a754c3663071e113715991b50f901ce9c89 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/mc_taco.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/mutual.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/mutual.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c232e9606134332a7da2ff3749c44c374a5222f Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/mutual.cpython-310.pyc differ diff --git 
a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/naturalqs.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/naturalqs.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5834baad11e4e6e897bb5a92f5e8b130176aeed7 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/naturalqs.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/openbookqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/openbookqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a63fe17da168733592e9f80746732403a6b1519 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/openbookqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/pile.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/pile.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b01e0ad08b0a0d97e1ba1d79ae99a4c4c2c4acc Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/pile.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/piqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/piqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d170cfdd4457e36f99460189428cab4c89c0f4c Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/piqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/prost.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/prost.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..a45a715d31a32794150ad95ac47a08f6b4ababb2 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/prost.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/pubmedqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/pubmedqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de7024a36676c3e3e51a5cad5c5200b0784ed3e4 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/pubmedqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/qa4mre.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/qa4mre.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02256d0acaf080d97d00e4aea8108434ca5469cb Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/qa4mre.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/qasper.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/qasper.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d24ff3c2e3944ede5340997b207c5a912ee7ccdc Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/qasper.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/race.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/race.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b9b9b2b0965c51afcd0ccb17a89789cb056fae6 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/race.cpython-310.pyc differ diff --git 
a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/sat.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/sat.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80db4975f7fdbee55be6b211ed938ca64aa7b7f0 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/sat.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/sciq.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/sciq.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..85fd1fbc813fac61aad6a9266b21c71db5cbe952 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/sciq.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/squad.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/squad.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..89d0a76977c581396622c3e36505be0d8072d282 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/squad.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/storycloze.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/storycloze.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a018ac9bb301c3579adf1cea58256bd31d86b2f Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/storycloze.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/swag.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/swag.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ee97fe6b70c5c6a84e0e6d7bbc6f574cc1092df 
Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/swag.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/toxigen.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/toxigen.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d3326820193e08518dc3294ebbad60f5e5d51ea Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/toxigen.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/translation.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/translation.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7087dcea3370bd5aba29f5843f36e42c8993f3c Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/translation.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/triviaqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/triviaqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06c7c875e7942bc6da39b546cb3de5c8c0ca3673 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/triviaqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/truthfulqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/truthfulqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3377a2eff2a030e633b5ded11d1b658d0f41ac78 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/truthfulqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/webqs.cpython-310.pyc 
b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/webqs.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1fe3970ef197900219c330f5dc35dbf7a2709c37 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/webqs.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/wikitext.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/wikitext.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e42ac9313a6bf84323cec6e4559f5e063de3acc Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/wikitext.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/wsc273.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/wsc273.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..579e0e00035180b18c49ea5e85103bb405da5ec9 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/wsc273.cpython-310.pyc differ