diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/__init__.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4834200937f4ece26e98c677948ce315925e9811 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/base.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3f09999ba3c44646bef7d02361f12932f268805 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/base.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/evaluator.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/evaluator.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f07efc2f2deb0b4118ba473a5517ad0ad6cc915d Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/evaluator.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/metrics.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/metrics.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9099b40d803811a8a8fb6d97b5efecd1c5751ab7 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/metrics.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/utils.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51a3c608b71eb71df22b50e61995123d6ef621a7 Binary files /dev/null and 
b/scripts/yans/eval/lm-evaluation-harness/lm_eval/__pycache__/utils.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/README.md b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/README.md new file mode 100644 index 0000000000000000000000000000000000000000..01a026359725c91cd984633f07681c046bc0d2ae --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/README.md @@ -0,0 +1,8 @@ +# datasets + +This directory contains custom HuggingFace [dataset loading scripts](https://huggingface.co/docs/datasets/dataset_script). They are provided to maintain backward compatibility with the ad-hoc data downloaders in earlier versions of the `lm-evaluation-harness` before HuggingFace [`datasets`](https://huggingface.co/docs/datasets/index) was adopted as the default downloading manager. For example, some instances in the HuggingFace `datasets` repository process features (e.g. whitespace stripping, lower-casing, etc.) in ways that the `lm-evaluation-harness` did not. + +__NOTE__: We are __not__ accepting any additional loading scripts into the main branch! If you'd like to use a custom dataset, fork the repo and follow HuggingFace's loading script guide found [here](https://huggingface.co/docs/datasets/dataset_script). You can then override your `Task`'s `DATASET_PATH` attribute to point to this script's local path. + + +__WARNING__: A handful of loading scripts are included in this collection because they have not yet been pushed to the Huggingface Hub or a HuggingFace organization repo. We will remove such scripts once pushed. 
diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__pycache__/__init__.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee1306748cb4d983e3dcac4733490edba772a13e Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__pycache__/drop.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__pycache__/drop.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f91c157735f8450d605b7a3b45b02d026e954e93 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/__pycache__/drop.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/dataset_infos.json b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/dataset_infos.json new file mode 100644 index 0000000000000000000000000000000000000000..f155e7720d0aa5e330496c4f945f7c047424cc61 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/dataset_infos.json @@ -0,0 +1 @@ +{"drop": {"description": "DROP is a QA dataset which tests comprehensive understanding of paragraphs. 
In \nthis crowdsourced, adversarially-created, 96k question-answering benchmark, a \nsystem must resolve multiple references in a question, map them onto a paragraph,\nand perform discrete operations over them (such as addition, counting, or sorting).\n", "citation": "@misc{dua2019drop,\n title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, \n author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},\n year={2019},\n eprint={1903.00161},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://allenai.org/data/drop", "license": "", "features": {"section_id": {"dtype": "string", "id": null, "_type": "Value"}, "passage": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "query_id": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "validated_answers": {"feature": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": 
"Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "drop", "config_name": "drop", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 108858121, "num_examples": 77409, "dataset_name": "drop"}, "validation": {"name": "validation", "num_bytes": 12560739, "num_examples": 9536, "dataset_name": "drop"}}, "download_checksums": {"https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip": {"num_bytes": 8308692, "checksum": "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"}}, "download_size": 8308692, "post_processing_size": null, "dataset_size": 121418860, "size_in_bytes": 129727552}} diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/drop.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/drop.py new file mode 100644 index 0000000000000000000000000000000000000000..d892427b48f82dfb59b33f6844a7a5519ab09cf2 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/drop/drop.py @@ -0,0 +1,192 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Custom DROP dataset that, unlike HF, keeps all question-answer pairs +# even if there are multiple types of answers for the same question. 
+"""DROP dataset.""" + + +import json +import os + +import datasets + + +_CITATION = """\ +@misc{dua2019drop, + title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, + author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner}, + year={2019}, + eprint={1903.00161}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +""" + +_DESCRIPTION = """\ +DROP is a QA dataset which tests comprehensive understanding of paragraphs. In +this crowdsourced, adversarially-created, 96k question-answering benchmark, a +system must resolve multiple references in a question, map them onto a paragraph, +and perform discrete operations over them (such as addition, counting, or sorting). +""" + +_HOMEPAGE = "https://allenai.org/data/drop" + +# TODO: Add the licence for the dataset here if you can find it +_LICENSE = "" + +_URLS = { + "drop": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip", +} + +_EMPTY_VALIDATED_ANSWER = [ + { + "number": "", + "date": { + "day": "", + "month": "", + "year": "", + }, + "spans": [], + "worker_id": "", + "hit_id": "", + } +] + + +class Drop(datasets.GeneratorBasedBuilder): + """DROP is a QA dataset which tests comprehensive understanding of paragraphs.""" + + VERSION = datasets.Version("0.0.1") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig( + name="drop", version=VERSION, description="The DROP dataset." 
+ ), + ] + + def _info(self): + features = datasets.Features( + { + "section_id": datasets.Value("string"), + "passage": datasets.Value("string"), + "question": datasets.Value("string"), + "query_id": datasets.Value("string"), + "answer": { + "number": datasets.Value("string"), + "date": { + "day": datasets.Value("string"), + "month": datasets.Value("string"), + "year": datasets.Value("string"), + }, + "spans": datasets.features.Sequence(datasets.Value("string")), + "worker_id": datasets.Value("string"), + "hit_id": datasets.Value("string"), + }, + "validated_answers": datasets.features.Sequence( + { + "number": datasets.Value("string"), + "date": { + "day": datasets.Value("string"), + "month": datasets.Value("string"), + "year": datasets.Value("string"), + }, + "spans": datasets.features.Sequence(datasets.Value("string")), + "worker_id": datasets.Value("string"), + "hit_id": datasets.Value("string"), + } + ), + } + ) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + urls = _URLS[self.config.name] + data_dir = dl_manager.download_and_extract(urls) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join( + data_dir, "drop_dataset", "drop_dataset_train.json" + ), + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join( + data_dir, "drop_dataset", "drop_dataset_dev.json" + ), + "split": "validation", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + def _generate_examples(self, filepath, split): + with open(filepath, encoding="utf-8") as f: + data = json.load(f) + key = 0 + for section_id, example in data.items(): + # Each example (passage) 
has multiple sub-question-answer pairs. + for qa in example["qa_pairs"]: + # Build answer. + answer = qa["answer"] + answer = { + "number": answer["number"], + "date": { + "day": answer["date"].get("day", ""), + "month": answer["date"].get("month", ""), + "year": answer["date"].get("year", ""), + }, + "spans": answer["spans"], + "worker_id": answer.get("worker_id", ""), + "hit_id": answer.get("hit_id", ""), + } + validated_answers = [] + if "validated_answers" in qa: + for validated_answer in qa["validated_answers"]: + va = { + "number": validated_answer.get("number", ""), + "date": { + "day": validated_answer["date"].get("day", ""), + "month": validated_answer["date"].get("month", ""), + "year": validated_answer["date"].get("year", ""), + }, + "spans": validated_answer.get("spans", ""), + "worker_id": validated_answer.get("worker_id", ""), + "hit_id": validated_answer.get("hit_id", ""), + } + validated_answers.append(va) + else: + validated_answers = _EMPTY_VALIDATED_ANSWER + yield key, { + "section_id": section_id, + "passage": example["passage"], + "question": qa["question"], + "query_id": qa["query_id"], + "answer": answer, + "validated_answers": validated_answers, + } + key += 1 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/__pycache__/hendrycks_math.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/__pycache__/hendrycks_math.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f52fc50cf67eca96522c73fb069fdaeaacfa97e Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/__pycache__/hendrycks_math.cpython-310.pyc 
differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/dataset_infos.json b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/dataset_infos.json new file mode 100644 index 0000000000000000000000000000000000000000..27d154efa50fd68aa23d3de656c9ce6449faed61 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/dataset_infos.json @@ -0,0 +1 @@ +{"algebra": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "algebra", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 955021, "num_examples": 1744, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 648291, "num_examples": 1187, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, 
"dataset_size": 1603312, "size_in_bytes": 21931248}, "counting_and_probability": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "counting_and_probability", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 667385, "num_examples": 771, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 353803, "num_examples": 474, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1021188, "size_in_bytes": 21349124}, "geometry": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "geometry", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1077241, "num_examples": 870, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 523126, "num_examples": 479, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1600367, "size_in_bytes": 21928303}, "intermediate_algebra": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "intermediate_algebra", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1157476, "num_examples": 1295, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 795070, "num_examples": 903, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1952546, "size_in_bytes": 22280482}, "number_theory": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "number_theory", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 595793, "num_examples": 869, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 349455, "num_examples": 540, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 945248, "size_in_bytes": 21273184}, "prealgebra": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "prealgebra", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 715611, "num_examples": 1205, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 510195, "num_examples": 871, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1225806, "size_in_bytes": 21553742}, "precalculus": {"description": "MATH is a dataset of 12,500 challenging competition mathematics problems. 
Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.\n", "citation": "@article{hendrycksmath2021,\n title={Measuring Mathematical Problem Solving With the Math Dataset},\n author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},\n journal={NeurIPS},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/math", "license": "", "features": {"problem": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "solution": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_math", "config_name": "precalculus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 816245, "num_examples": 746, "dataset_name": "hendrycks_math"}, "test": {"name": "test", "num_bytes": 552893, "num_examples": 546, "dataset_name": "hendrycks_math"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/MATH.tar": {"num_bytes": 20327936, "checksum": "0fbe4fad0df66942db6c221cdcc95b298cc7f4595a2f0f518360cce84e90d9ac"}}, "download_size": 20327936, "post_processing_size": null, "dataset_size": 1369138, "size_in_bytes": 21697074}} diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/hendrycks_math.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/hendrycks_math.py new file mode 100644 index 0000000000000000000000000000000000000000..043adeeed6648ce04e3209b6359cb119699eddd1 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/hendrycks_math.py @@ -0,0 +1,122 @@ +# Copyright 2020 The HuggingFace Datasets Authors and 
the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MATH dataset.""" + + +import json +import os +import pathlib + +import datasets + + +_CITATION = """\ +@article{hendrycksmath2021, + title={Measuring Mathematical Problem Solving With the Math Dataset}, + author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt}, + journal={NeurIPS}, + year={2021} +} +""" + +_DESCRIPTION = """\ +MATH is a dataset of 12,500 challenging competition mathematics problems. Each +problem in Math has a full step-by-step solution which can be used to teach +models to generate answer derivations and explanations. 
+""" + +_HOMEPAGE = "https://github.com/hendrycks/math" + +# TODO: Add the licence for the dataset here if you can find it +_LICENSE = "" + +_URLS = "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar" + +_NAMES = [ + "algebra", + "counting_and_probability", + "geometry", + "intermediate_algebra", + "number_theory", + "prealgebra", + "precalculus", +] + + +class HendrycksMath(datasets.GeneratorBasedBuilder): + """MATH is a dataset of 12,500 challenging competition mathematics problems.""" + + VERSION = datasets.Version("0.0.1") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig(name=name, version=version, description=name) + for name, version in zip(_NAMES, [VERSION] * len(_NAMES)) + ] + + def _info(self): + features = datasets.Features( + { + "problem": datasets.Value("string"), + "level": datasets.Value("string"), + "type": datasets.Value("string"), + "solution": datasets.Value("string"), + } + ) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + urls = _URLS + data_dir = dl_manager.download_and_extract(urls) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "basepath": os.path.join( + data_dir, "MATH", "train", self.config.name + ), + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "basepath": os.path.join( + data_dir, "MATH", "test", self.config.name + ), + "split": "test", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + def _generate_examples(self, basepath, split): + key = 0 + for file in sorted(pathlib.Path(basepath).iterdir()): + with open(file, "r", encoding="utf-8") as f: + data = json.load(f) + yield key, { + "problem": data["problem"], + "level": data["level"], + 
"type": data["type"], + "solution": data["solution"], + } + key += 1 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/decontamination/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/decontamination/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/decontamination/archiver.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/decontamination/archiver.py new file mode 100644 index 0000000000000000000000000000000000000000..488a55dd7352d333d68a3557058ca81dfe704a38 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/decontamination/archiver.py @@ -0,0 +1,161 @@ +import os +import zstandard +import json +import jsonlines +import io +import datetime +import mmap +import tqdm +from pathlib import Path + + +def json_serial(obj): + """JSON serializer for objects not serializable by default json code""" + + if isinstance(obj, (datetime.datetime,)): + return obj.isoformat() + raise TypeError("Type %s not serializable" % type(obj)) + + +# Modified version of lm_dataformat Archive for single file. +class Archive: + def __init__(self, file_path, compression_level=3): + self.file_path = file_path + dir_name = os.path.dirname(file_path) + if dir_name: + os.makedirs(dir_name, exist_ok=True) + self.fh = open(self.file_path, "wb") + self.cctx = zstandard.ZstdCompressor(level=compression_level) + self.compressor = self.cctx.stream_writer(self.fh) + + def add_data(self, data, meta={}): + self.compressor.write( + json.dumps({"text": data, "meta": meta}, default=json_serial).encode( + "UTF-8" + ) + + b"\n" + ) + + def commit(self): + self.compressor.flush(zstandard.FLUSH_FRAME) + self.fh.flush() + self.fh.close() + + +# Modified version of lm_dataformat Reader with self.fh set, allowing peeking for tqdm. 
# Modified version of lm_dataformat Reader with self.fh set, allowing peeking for tqdm.
class Reader:
    """Streams documents out of a zstd-compressed JSONL file."""

    def __init__(self):
        pass

    def read(self, file, get_meta=False, autojoin_paragraphs=True, para_joiner="\n\n"):
        """Yield document texts from `file`; with `get_meta`, yield (text, meta) pairs."""
        with open(file, "rb") as fh:
            self.fh = fh  # exposed so callers can peek at the raw file position
            cctx = zstandard.ZstdDecompressor()
            reader = io.BufferedReader(cctx.stream_reader(fh))
            rdr = jsonlines.Reader(reader)
            for ob in rdr:
                # naive jsonl where each object is just the string itself, with no meta. For legacy compatibility.
                if isinstance(ob, str):
                    assert not get_meta
                    yield ob
                    continue

                text = ob["text"]

                if autojoin_paragraphs and isinstance(text, list):
                    text = para_joiner.join(text)

                if get_meta:
                    yield text, (ob["meta"] if "meta" in ob else {})
                else:
                    yield text


class TextArchive:
    """Append-only plain-text archive, one record per line (UTF-8 + newline)."""

    def __init__(self, file_path, mode="rb+"):
        self.file_path = file_path
        dir_name = os.path.dirname(file_path)
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)

        # Create the file up front so the default "rb+" mode can open a fresh path.
        if not os.path.exists(file_path):
            Path(file_path).touch()

        self.fh = open(self.file_path, mode)

    def add_data(self, data):
        self.fh.write(data.encode("UTF-8") + b"\n")

    def commit(self):
        self.fh.flush()
        self.fh.close()


class TextReader:
    """Line-oriented readers over a text file, optimized via mmap."""

    def __init__(self, file_path):
        self.file_path = file_path

    # Optimized mmap read with infrequent tqdm updates to maintain speed
    # Tested up to 250MB/s.
    def read_tqdm(self, update_frequency=10000):
        """Yield lines (trailing newline stripped) with a byte-based progress bar."""
        current_file_position = 0
        line_counter = 0
        with open(self.file_path, "r") as fh, tqdm.tqdm(
            total=os.path.getsize(self.file_path),
            dynamic_ncols=True,
            unit="byte",
            unit_scale=1,
        ) as progress:
            with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                for line in iter(mmap_obj.readline, b""):
                    line = line.decode("utf-8")
                    line_counter += 1
                    # Update tqdm only every `update_frequency` lines; per-line
                    # progress updates would dominate runtime at these speeds.
                    if line_counter == update_frequency:
                        new_file_pos = mmap_obj.tell()
                        bytes_read = new_file_pos - current_file_position
                        current_file_position = new_file_pos
                        progress.update(bytes_read)
                        line_counter = 0
                    yield line[:-1]

    def read_and_tell(self):
        """Yield (line, raw_bytes_read) so callers can track consumed bytes."""
        current_file_position = 0
        with open(self.file_path, "r", encoding="utf8") as fh:
            with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                for line in iter(mmap_obj.readline, b""):
                    line = line.decode("utf-8")
                    new_file_pos = mmap_obj.tell()
                    raw_bytes_read = new_file_pos - current_file_position
                    current_file_position = new_file_pos
                    yield line[:-1], raw_bytes_read

    def read(self):
        """Yield lines (trailing newline stripped) via mmap."""
        with open(self.file_path, "r", encoding="utf8") as fh:
            with mmap.mmap(fh.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                for line in iter(mmap_obj.readline, b""):
                    line = line.decode("utf-8")
                    yield line[:-1]

    def read_slow(self):
        """Plain readline() fallback; slower than the mmap-based readers."""
        with open(self.file_path, "r", encoding="utf8") as fh:
            while True:
                line = fh.readline()
                # Fix: readline() returns "" at EOF, never -1 — the old
                # `line == -1` comparison was dead code.
                if line == "":
                    break
                yield line[:-1]
# Optimized for speed. Decompresses the archive in shell before
# using the mmap'd TextReader.
class ZStdTextReader:
    """Reads a `.zst`-compressed text file by decompressing it to disk first."""

    def __init__(self, file):
        self.file = file

    def read_tqdm(self):
        """Decompress with the external `zstd` binary, stream lines, then clean up."""
        decompressed_file = self.file[:-4]  # strip the ".zst" suffix
        print("Decompressing file, please wait...")
        # Fix: use an argument list via subprocess instead of an interpolated
        # shell string, so paths containing spaces or shell metacharacters
        # cannot break (or inject into) the command.
        import subprocess

        subprocess.run(["zstd", "-d", self.file], check=True)  # linux decompress is faster
        reader = TextReader(decompressed_file)
        yield from reader.read_tqdm()
        os.remove(decompressed_file)


import time
import random
import pickle
import json
import glob
import collections


# Was used for testing the evaluator decoupled from the full logic below
def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
    """Pretend 10% of `docs` overlap the training set; return their indices."""
    simulated_overlap = 0.1
    contaminated = int(len(docs) * simulated_overlap)
    return random.sample(range(len(docs)), contaminated)
def get_train_overlap(docs_by_task_set, ngrams_path, limit):
    """Find training-set contamination for each (task, set) of eval documents.

    An overlap occurs when any n-gram (typically 13-grams; see
    scripts/clean_training_data) found in a task document also exists in the
    training-set ngram files ("ngrams_{x}.bkt.txt.sorted.zst" plus info.json
    under `ngrams_path`).

    Algorithm:
    1. Build lookups for each dataset {ngram: set(doc_ids)}
    2. Merge into an overall lookup {ngram: [(task_name, task_set, doc_ids),]}
    3. Full scan the training-set ngrams against the merged lookup, saving
       matches in `duplicates` {(task_name, task_set): set(doc_ids)}
    4. Strip the task_set from the dictionary keys and return

    Task+set lookups and the resulting overlaps are cached under `data/`.

    :param docs_by_task_set: {(task_name, task_set): [document_text, ...]}
    :param ngrams_path: directory containing info.json and *.sorted.zst files
    :param limit: eval doc limit; only used to key the on-disk caches
    :return: {task_name: set(doc_ids)} of contaminated document ids
    """
    info_dict_path = os.path.join(ngrams_path, "info.json")
    # Fix: close the handle instead of leaking it via json.load(open(...)).
    with open(info_dict_path, "r") as fp:
        info_dict = json.load(fp)
    ngrams_n_size = info_dict["ngram_size"]

    janitor = Janitor()

    # Build lookup for each dataset first in case we use different task combinations later
    print("Building Lookups...")
    start = time.perf_counter()

    def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit):
        # Cache-file naming scheme shared by the dump step at the bottom.
        return f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.overlaps"

    lookups = {}
    duplicates = {}  # {(task_name, task_set): set(doc_ids)}
    sets_to_decontaminate = len(docs_by_task_set.keys())

    for (task_name, task_set), docs in docs_by_task_set.items():
        # Fix: makedirs(exist_ok=True) — the old os.mkdir crashed when the
        # parent "data/" directory did not already exist.
        os.makedirs(f"data/{task_name}", exist_ok=True)

        # Check if we've decontaminated this combination before
        overlaps_dump_path = get_overlaps_dump_path(
            task_name, task_set, ngrams_n_size, limit
        )
        if os.path.exists(overlaps_dump_path):
            with open(overlaps_dump_path, "rb") as fp:
                duplicates[(task_name, task_set)] = pickle.load(fp)
            sets_to_decontaminate -= 1
            continue
        else:
            duplicates[(task_name, task_set)] = set()

        # Build/load the task lookup {ngram: set(documents)}.
        task_set_lookup_path = (
            f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.lookup"
        )
        if os.path.exists(task_set_lookup_path):
            print(f"{task_set_lookup_path} available, loading...")
            with open(task_set_lookup_path, "rb") as fp:
                lookups[(task_name, task_set)] = pickle.load(fp)
        else:
            print(f"{task_set_lookup_path} not available, building...")
            lookup = collections.defaultdict(set)

            for doc_id, document in enumerate(docs):
                ngrams = word_ngrams(janitor.normalize_string(document), ngrams_n_size)
                for ngram in ngrams:
                    lookup[ngram].add(doc_id)

            with open(task_set_lookup_path, "wb") as fp:
                pickle.dump(lookup, fp)
            lookups[(task_name, task_set)] = lookup

    elapsed = time.perf_counter() - start
    print(f"Building lookups took {elapsed:0.5f} seconds.")

    matched_ngrams = []

    if sets_to_decontaminate > 0:
        print("Merging lookups...")
        start = time.perf_counter()
        merged_lookup = collections.defaultdict(list)
        for (task_name, task_set), lookup in lookups.items():
            for ngram, doc_ids in lookup.items():
                merged_lookup[ngram].append((task_name, task_set, doc_ids))

        elapsed = time.perf_counter() - start
        print(f"Merging lookups took {elapsed:0.5f} seconds.")

        print(f"{ngrams_n_size} grams files found in {ngrams_path}:")
        files = glob.glob(os.path.join(ngrams_path, "*.sorted.zst"))
        print(files)

        for file in files:
            start = time.perf_counter()
            print(f"Scanning {file}")
            reader = ZStdTextReader(file)
            total_ngrams = 0
            unique_ngrams = 0
            matching_unique = 0
            non_matching_unique = 0

            current_ngram = ""
            for line in reader.read_tqdm():  # Scan training set ngrams file
                total_ngrams += 1
                [ngram, document_id] = line.rsplit(" ", 1)
                if (
                    ngram != current_ngram
                ):  # Only need to match the ngram once in training set
                    unique_ngrams += 1
                    current_ngram = ngram
                    if ngram in merged_lookup:
                        matched_ngrams.append(ngram)  # For logging
                        matching_unique += 1
                        for task_name, task_set, doc_ids in merged_lookup[ngram]:
                            task_doc_set = duplicates[(task_name, task_set)]
                            for (
                                doc_id
                            ) in (
                                doc_ids
                            ):  # Record contamination across all relevant task/set combos
                                task_doc_set.add(doc_id)
                        del merged_lookup[ngram]  # No point matching again
                    else:
                        non_matching_unique += 1

            print(f"Total Ngrams: {total_ngrams}")
            print(f"Unique Ngrams: {unique_ngrams}")
            print(f"Unique Matching: {matching_unique}")
            print(f"Unique Non Matching: {non_matching_unique}")
            print("Matched ngrams:")
            for ngram in matched_ngrams:
                print(ngram)

            elapsed = time.perf_counter() - start
            print(f"Read took {elapsed:0.5f} seconds.")
            print(f"Speed: {(os.path.getsize(file)/1000000.0)/elapsed}MB/second")

    print(duplicates)

    # Dump overlaps separately
    for (task_name, task_set), doc_ids in duplicates.items():
        overlaps_dump_path = get_overlaps_dump_path(
            task_name, task_set, ngrams_n_size, limit
        )
        # Fix: close the handle instead of leaking it via pickle.dump(..., open(...)).
        with open(overlaps_dump_path, "wb") as fp:
            pickle.dump(doc_ids, fp)

    # Strip task set and return
    return {task_name: doc_ids for (task_name, task_set), doc_ids in duplicates.items()}
import re
import string
import timeit
import pickle
import traceback
from pprint import pprint

# This is a cpp module. Compile janitor_util.cpp with:
# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
try:
    import janitor_util

    JANITOR_CPP = True
except Exception:
    print("WARNING: C++ module could not be loaded. Janitor running in python mode")
    traceback.print_exc()
    JANITOR_CPP = False


# Implementation from nltk source
# https://www.nltk.org/_modules/nltk/util.html
def form_ngrams(sequence, n):
    """Yield successive n-tuples from the *iterator* `sequence`."""
    history = []
    while n > 1:
        # PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
        try:
            next_item = next(sequence)
        except StopIteration:
            # no more data, terminate the generator
            return
        history.append(next_item)
        n -= 1
    for item in sequence:
        history.append(item)
        yield tuple(history)
        del history[0]


def word_ngrams(s, n):
    """Splits a string into ngram words"""
    tokens = s.split()  # not a generator :(
    ngram_seqs = form_ngrams(iter(tokens), n)
    return (" ".join(ngram) for ngram in ngram_seqs)


# https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
def split_indices(s):
    """Splits a string on whitespaces and records the indices of each in the original string.
    @:return generator((word, (start_idx, end_idx)), ...)
    """
    return ((m.group(0), (m.start(), m.end() - 1)) for m in re.finditer(r"\S+", s))


def word_ngrams_indices(s, n):
    """Splits a string into pairs of (ngram words, their start/end indices)"""
    tokens_with_indices = split_indices(s)

    # Generator of ngrams of (word, idx_pairs)
    ngram_seqs_with_indices = form_ngrams(tokens_with_indices, n)

    # Generator of pairs of word and index ngrams
    ngram_indices_pairs = (
        zip(*ngram_with_indices) for ngram_with_indices in ngram_seqs_with_indices
    )

    # Generator of ( (word_ngram, (start, end)), (word_ngram, start, end)), ...)
    return (
        (" ".join(ngram_seq), (indices[0][0], indices[-1][1]))
        for ngram_seq, indices in ngram_indices_pairs
    )


class Janitor:
    """Registers contamination n-grams and removes them from training text."""

    # FIXME delete_chars: Should anything else go here? Special chars?
    def __init__(
        self,
        ngram_n=13,
        window_to_remove=200,
        too_dirty_cutoff=10,
        minimum_slice_length=200,
        delete_chars=string.punctuation,
    ):
        self.ngram_n = ngram_n
        self.window_to_remove = window_to_remove
        self.too_dirty_cutoff = too_dirty_cutoff
        self.minimum_slice_length = minimum_slice_length
        self.delete_chars = delete_chars

        self.dirt_ngrams = set()

        # If in python, we'll translate uppercase to lowercase and delete naughty characters.
        # This is fast by python standards
        # https://stackoverflow.com/questions/638893/what-is-the-most-efficient-way-in-python-to-convert-a-string-to-all-lowercase-st
        self.translation_table = str.maketrans(
            string.ascii_lowercase + string.ascii_uppercase,  # These characters
            string.ascii_lowercase * 2,  # Become these characters
            self.delete_chars,  # These are deleted
        )

    ##############
    # I/O for saving contamination ngrams
    ##############

    def save_contamination_ngrams(self, filename):
        """Persist the registered contamination ngrams to `filename`.

        Fix: this previously pickled the *filename* string instead of the
        ngram set, so load_contamination_ngrams could never round-trip.
        """
        with open(filename, "wb") as fp:
            pickle.dump(self.dirt_ngrams, fp)

    def load_contamination_ngrams(self, filename):
        with open(filename, "rb") as fp:
            self.dirt_ngrams = pickle.load(fp)

    ##############
    # Call these :)
    ##############

    def register_contaminant(self, dirt_string):
        """Register a string as contamination to be removed, e.g. a test set
        This breaks the dirt_string into ngrams to store for future cleaning"""
        if JANITOR_CPP:
            return self.register_contaminant_cpp(dirt_string)
        else:
            print("WARNING: Janitor running in python mode")
            return self.register_contaminant_python(dirt_string)

    def clean(self, dirty_string):
        """Clean a string (e.g. a training set) by removing all ngrams previously
        registered as contaminants. Returns a list of clean chunks, or empty if
        the string was too dirty"""
        if JANITOR_CPP:
            return self.clean_cpp(dirty_string)
        else:
            print("WARNING: Janitor running in python mode")
            return self.clean_python(dirty_string)

    def _split_chunks(self, dirty_string, dirty_parts):
        """Cut `dirty_string` around each dirty span (plus a removal window);
        keep only slices longer than `minimum_slice_length`, and give up
        entirely after `too_dirty_cutoff` dirty spans."""
        clean_chunks = []
        splice_idx = 0
        end = -1
        for i, (ngram, start, end) in enumerate(dirty_parts):
            if i >= self.too_dirty_cutoff:
                return []
            start = max(0, start - self.window_to_remove)
            end = min(len(dirty_string), end + self.window_to_remove)

            if start - splice_idx > self.minimum_slice_length:
                clean_chunks.append(dirty_string[splice_idx:start])
            splice_idx = end

        if end < len(dirty_string) - self.minimum_slice_length:
            clean_chunks.append(dirty_string[end + 1 :])

        return clean_chunks

    ##############
    # Fast C++
    ##############

    def register_contaminant_cpp(self, dirt_string):
        self.dirt_ngrams.update(
            janitor_util.clean_ngram(dirt_string, self.delete_chars, self.ngram_n)
        )

    def clean_cpp(self, dirty_string):
        contamination_indices = janitor_util.clean_ngram_with_indices(
            dirty_string, self.delete_chars, self.ngram_n
        )
        return self._split_chunks(dirty_string, contamination_indices)

    ##############
    # Slow python
    ##############

    def normalize_string(self, s):
        # Lowercase and strip `delete_chars` in a single C-level pass.
        return s.translate(self.translation_table)

    def register_contaminant_python(self, dirt_string):
        self.dirt_ngrams.update(
            word_ngrams(self.normalize_string(dirt_string), self.ngram_n)
        )

    def clean_python(self, dirty_string):
        contamination_indices = (
            (None, *idx_pair)
            for dirty_ngram, idx_pair in word_ngrams_indices(dirty_string, self.ngram_n)
            if self.normalize_string(dirty_ngram) in self.dirt_ngrams
        )
        return self._split_chunks(dirty_string, contamination_indices)
This article, many professionals assert, is just one example of +# the media's application of mental disease labels to what is actually variant normal behavior +# &mdash;they argue that shyness, lack of athletic ability or social skills, and intellectual +# interests, even when they seem unusual to others, are not in themselves signs of autism or +# Asperger's syndrome. Others assert that it is actually the medical profession which is applying +# mental disease labels to children who in the past would have simply been accepted as a little +# different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue. +# Due to the recent publicity surrounding autism and autis +# ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first, +# oil money had a marginal impact. A few lowrise concete buildings were erected, and the first +# paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties +# would last, took a cautious approach, preferring to save the revenue rather than investing it in +# development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential +# to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his +# brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]], +# with the assistance of the British, Sheikh Zayed became the new ruler. See generally, Al-Fahim, M, +# ''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995), +# ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the +# Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the +# [[United Arab Emirates]]. 
After the Emirates gained independence in 1971, +# ''') +# """ + +# n = 1 +# print(f"Timing {n} run on 100 MB") +# print("Register contaminant") +# # print("\tPython", timeit.timeit("jan.register_contaminant_python(data)", setup=setup, globals=globals(), number=n)) +# print("\tCpp", timeit.timeit("jan.register_contaminant(data)", setup=setup, globals=globals(), number=n)) + +# print("Clean") +# # print("\tPython", timeit.timeit("jan.clean_python(data)", setup=setup, globals=globals(), number=n)) +# print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n)) + + +# def test_janitor_general(): +# source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2 +# contaminant = "dirty boy. Clean he he" + +# jan = Janitor(ngram_n=3) +# jan.register_contaminant(contaminant) +# cleaned = " ".join(jan.clean(source)) +# for contam in jan.dirt_ngrams: +# assert contam not in cleaned, contam + +# filename = "data/saved_contam" +# jan.save_contamination_ngrams(filename) + +# jan = Janitor(ngram_n=3) +# jan.load_contamination_ngrams(filename) +# cleaned = " ".join(jan.clean(source)) +# for contam in jan.dirt_ngrams: +# assert contam not in cleaned, contam + + +# if __name__ == "__main__": +# test() +# # print_cpp() +# # test_cpp() +# # benchmark() diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..63f92c46008d3f587e9970984a8c79974115aeed --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/models/__init__.py @@ -0,0 +1,20 @@ +from . import gpt2 +from . import gpt3 +from . import huggingface +from . import textsynth +from . 
from . import gpt2
from . import gpt3
from . import huggingface
from . import textsynth
from . import dummy

# Maps the user-facing model names onto their LM implementations.
MODEL_REGISTRY = {
    "hf": gpt2.HFLM,
    "hf-causal": gpt2.HFLM,
    "hf-causal-experimental": huggingface.AutoCausalLM,
    "hf-seq2seq": huggingface.AutoSeq2SeqLM,
    "gpt2": gpt2.GPT2LM,
    "gpt3": gpt3.GPT3LM,
    "textsynth": textsynth.TextSynthLM,
    "dummy": dummy.DummyLM,
}


def get_model(model_name):
    """Look up an LM class by its registry name.

    Still raises KeyError for unknown names (as before), but now with a
    message listing the valid options instead of just the bad key.
    """
    try:
        return MODEL_REGISTRY[model_name]
    except KeyError:
        raise KeyError(
            f"Unknown model '{model_name}'. Available models: "
            + ", ".join(sorted(MODEL_REGISTRY))
        ) from None
import random
from lm_eval.base import LM


class DummyLM(LM):
    """Stub LM returning random log-likelihoods and a fixed completion.

    Lets the evaluation pipeline be exercised without loading a real model.
    """

    def __init__(self):
        pass

    @classmethod
    def create_from_arg_string(cls, arg_string, additional_config=None):
        # Nothing to parse — every DummyLM instance is identical.
        return cls()

    def loglikelihood(self, requests):
        # One (logprob, is_greedy) pair per request; logprob drawn from (-1, 0].
        return [(-random.random(), False) for _ in requests]

    def greedy_until(self, requests):
        completions = []
        for context, _ in requests:
            completions.append("lol")
            assert context.strip() != ""
        return completions

    def loglikelihood_rolling(self, requests):
        # One random negative log-likelihood per request.
        return [-random.random() for _ in requests]
import torch
import transformers
from typing import Optional, Union
from lm_eval.base import BaseLM


class HFLM(BaseLM):
    """BaseLM backed by a HuggingFace `AutoModelForCausalLM` checkpoint."""

    def __init__(
        self,
        device="cuda",
        pretrained="gpt2",
        revision="main",
        low_cpu_mem_usage=None,
        torch_dtype=None,
        device_map=None,
        subfolder=None,
        tokenizer=None,
        batch_size=1,
        load_in_8bit: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
        use_fast: Optional[bool] = True,
    ):
        super().__init__()

        assert isinstance(device, str)
        assert isinstance(pretrained, str)
        assert isinstance(batch_size, int)

        if device:
            if device not in ["cuda", "cpu"]:
                device = int(device)  # e.g. "0" -> CUDA device index 0
            self._device = torch.device(device)
            print(f"Using device '{device}'")
        else:
            print("Device not specified")
            print(f"Cuda Available? {torch.cuda.is_available()}")
            self._device = (
                torch.device("cuda")
                if torch.cuda.is_available()
                else torch.device("cpu")
            )

        # TODO: update this to be less of a hack once subfolder is fixed in HF
        revision = revision + ("/" + subfolder if subfolder is not None else "")

        self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
            pretrained,
            load_in_8bit=load_in_8bit,
            low_cpu_mem_usage=low_cpu_mem_usage,
            torch_dtype=torch_dtype,
            device_map=device_map,
            revision=revision,
            trust_remote_code=trust_remote_code,
        ).eval()
        if not load_in_8bit:
            # Fix: catch Exception instead of a bare `except:` so
            # KeyboardInterrupt / SystemExit still propagate.
            try:
                self.gpt2.to(self.device)
            except Exception:
                print(
                    "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore."
                )
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            pretrained if tokenizer is None else tokenizer,
            revision=revision,
            trust_remote_code=trust_remote_code,
            use_fast=use_fast,
        )
        self.vocab_size = self.tokenizer.vocab_size

        # multithreading and batching
        self.batch_size_per_gpu = batch_size  # todo: adaptive batch size

        # TODO: fix multi-gpu
        # gpus = torch.cuda.device_count()
        # if gpus > 1:
        #     self.gpt2 = nn.DataParallel(self.gpt2)

    @property
    def eot_token_id(self):
        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
        return self.tokenizer.eos_token_id

    @property
    def max_length(self):
        """Model context length; falls back for configs without `n_ctx`."""
        try:
            return self.gpt2.config.n_ctx
        except AttributeError:
            # gptneoconfig doesn't have n_ctx apparently
            return self.gpt2.config.max_position_embeddings

    @property
    def max_gen_toks(self):
        return 256

    @property
    def batch_size(self):
        # TODO: fix multi-gpu
        return self.batch_size_per_gpu  # * gpus

    @property
    def device(self):
        # TODO: fix multi-gpu
        return self._device

    def tok_encode(self, string: str):
        return self.tokenizer.encode(string, add_special_tokens=False)

    def tok_decode(self, tokens):
        return self.tokenizer.decode(tokens)

    def _model_call(self, inps):
        """
        inps: a torch tensor of shape [batch, sequence]
        the size of sequence may vary from call to call

        returns: a torch tensor of shape [batch, sequence, vocab] with the
        logits returned from the model
        """
        with torch.no_grad():
            return self.gpt2(inps)[0]

    def _model_generate(self, context, max_length, eos_token_id):
        """Greedy generation (do_sample=False), padding with the EOS token."""
        return self.gpt2.generate(
            context,
            max_length=max_length,
            eos_token_id=eos_token_id,
            pad_token_id=eos_token_id,
            do_sample=False,
        )


# for backwards compatibility
GPT2LM = HFLM
def get_result(response, ctxlen):
    """Process results from OpenAI API response.

    :param response: dict
        OpenAI API Response
    :param ctxlen: int
        Length of context (so we can slice them away and only keep the predictions)
    :return:
        continuation_logprobs: np.array
            Log probabilities of continuation tokens
        is_greedy: bool
            whether argmax matches given continuation exactly
    """
    logprob_info = response["logprobs"]
    continuation_logprobs = sum(logprob_info["token_logprobs"][ctxlen:])

    # The continuation is "greedy" iff every continuation token was also the
    # highest-scoring candidate the model proposed at that position.
    is_greedy = True
    tokens = logprob_info["tokens"]
    for position in range(ctxlen, len(tokens)):
        candidates = logprob_info["top_logprobs"][position]
        best = max(candidates.keys(), key=lambda tok: candidates[tok])
        if best != tokens[position]:
            is_greedy = False
            break

    return continuation_logprobs, is_greedy


def oa_completion(**kwargs):
    """Query OpenAI API for completion.

    Retry with back-off until they respond
    """
    import openai

    delay = 3
    while True:
        try:
            return openai.Completion.create(**kwargs)
        except openai.error.OpenAIError:
            import traceback

            traceback.print_exc()
            time.sleep(delay)
            delay *= 1.5
davinci) + :param truncate: bool + Truncate input if too long (if False and input is too long, throw error) + """ + super().__init__() + + import openai + + self.engine = engine + self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2") + + self.vocab_size = self.tokenizer.vocab_size + + # to make the annoying "Using pad_token, but it is not set yet." error go away + self.tokenizer.pad_token = "<|endoftext|>" + assert self.tokenizer.encode("hello\n\nhello") == [31373, 198, 198, 31373] + self.truncate = truncate + self.end_of_text_token_id = self.tokenizer.convert_tokens_to_ids( + ["<|endoftext|>"] + )[0] + + # Read from environment variable OPENAI_API_SECRET_KEY + openai.api_key = os.environ["OPENAI_API_SECRET_KEY"] + + @property + def eot_token_id(self): + return self.tokenizer.eos_token_id + + @property + def max_length(self): + # Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token + return 2048 + + @property + def max_gen_toks(self): + return 256 + + @property + def batch_size(self): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + @property + def device(self): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + def tok_encode(self, string: str): + return self.tokenizer.encode(string, add_special_tokens=False) + + def tok_decode(self, tokens): + return self.tokenizer.decode(tokens) + + def _loglikelihood_tokens(self, requests, disable_tqdm=False): + res = [] + + def _collate(x): + # this doesn't efficiently handle last-token differences yet, but those are kinda annoying because + # it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations + # we care about and so we need some kind of backup for when it isn't + toks = x[1] + x[2] + return -len(toks), tuple(toks) + + re_ord = utils.Reorderer(requests, _collate) + + for chunk in tqdm( + list(utils.chunks(re_ord.get_reordered(), 
self.REQ_CHUNK_SIZE)), + disable=disable_tqdm, + ): + inps = [] + ctxlens = [] + for cache_key, context_enc, continuation_enc in chunk: + # max_length+1 because the API takes up to 2049 tokens, including the first context token + inp = (context_enc + continuation_enc)[-(self.max_length + 1) :] + # TODO: the logic is much simpler if we just look at the length of continuation tokens + ctxlen = len(context_enc) - max( + 0, len(context_enc) + len(continuation_enc) - (self.max_length + 1) + ) + + inps.append(inp) + ctxlens.append(ctxlen) + + response = oa_completion( + engine=self.engine, + prompt=inps, + echo=True, + max_tokens=0, + temperature=0.0, + logprobs=10, + ) + + for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip( + response.choices, ctxlens, chunk + ): + answer = get_result(resp, ctxlen) + + res.append(answer) + + # partial caching + if cache_key is not None: + self.cache_hook.add_partial("loglikelihood", cache_key, answer) + + return re_ord.get_original(res) + + def greedy_until(self, requests): + if not requests: + return [] + res = [] + + def _collate(x): + toks = self.tok_encode(x[0]) + return len(toks), x[0] + + re_ord = utils.Reorderer(requests, _collate) + + def sameuntil_chunks(xs, size): + ret = [] + lastuntil = xs[0][1] + for x in xs: + if len(ret) >= size or x[1] != lastuntil: + yield ret, lastuntil + ret = [] + lastuntil = x[1] + ret.append(x) + + if ret: + yield ret, lastuntil + + # todo: more intelligent batching for heterogeneous `until` + for chunk, until in tqdm( + list(sameuntil_chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE)) + ): + inps = [] + for request in chunk: + context = request[0] + context_enc = self.tok_encode(context) + inp = context_enc[-(self.max_length - self.max_gen_toks) :] + inps.append(inp) + + response = oa_completion( + engine=self.engine, + prompt=inps, + max_tokens=self.max_gen_toks, + temperature=0.0, + logprobs=10, + stop=until, + ) + + for resp, request in zip(response.choices, chunk): + context 
= request[0] + until_ = request[1] + s = resp["text"] + + for term in until_: + s = s.split(term)[0] + + # partial caching + self.cache_hook.add_partial("greedy_until", (context, until_), s) + + res.append(s) + + return re_ord.get_original(res) + + def _model_call(self, inps): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + def _model_generate(self, context, max_length, eos_token_id): + # Isn't used because we override greedy_until + raise NotImplementedError() diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/models/huggingface.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/models/huggingface.py new file mode 100644 index 0000000000000000000000000000000000000000..caa91a7a00dd324988728f74cb1b6cfe54087723 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/models/huggingface.py @@ -0,0 +1,740 @@ +import math +import torch +import torch.nn.functional as F +import transformers +import peft +from pathlib import Path +from typing import List, Mapping, NewType, Optional, Tuple, Union +from tqdm import tqdm + +from transformers import BatchEncoding + +from lm_eval import utils +from lm_eval.base import BaseLM + +TokenSequence = Union[List[int], torch.LongTensor, torch.Tensor, BatchEncoding] + +_DeviceMapping = NewType("DeviceMapping", Mapping[str, Union[int, str, torch.device]]) + + +def _get_accelerate_args( + device_map_option: Optional[str] = "auto", + max_memory_per_gpu: Optional[Union[int, str]] = None, + max_cpu_memory: Optional[Union[int, str]] = None, + offload_folder: Optional[str] = "./offload", +) -> dict: + """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`.""" + max_memory = {} + if max_memory_per_gpu is not None: + max_memory_per_gpu_map = { + device_idx: max_memory_per_gpu + for device_idx in range(torch.cuda.device_count()) + } + max_memory.update(max_memory_per_gpu_map) + if max_cpu_memory is not None: + max_memory["cpu"] = max_cpu_memory + + args = 
{} + if max_memory: + args["max_memory"] = max_memory + args["device_map"] = device_map_option + args["offload_folder"] = offload_folder + return args + + +def _get_dtype( + dtype: Union[str, torch.dtype], config: Optional[transformers.AutoConfig] = None +) -> torch.dtype: + """Converts `dtype` from `str` to torch.dtype when possible.""" + if dtype is None and config is not None: + _torch_dtype = config.torch_dtype + elif isinstance(dtype, str) and dtype != "auto": + # Convert `str` args torch dtype: `float16` -> `torch.float16` + _torch_dtype = getattr(torch, dtype) + else: + _torch_dtype = dtype + return _torch_dtype + + +class HuggingFaceAutoLM(BaseLM): + AUTO_CONFIG_CLASS: transformers.AutoConfig = transformers.AutoConfig + AUTO_TOKENIZER_CLASS: transformers.AutoTokenizer = transformers.AutoTokenizer + AUTO_MODEL_CLASS: transformers.AutoModel = None + AUTO_PEFT_CLASS: peft.PeftModel = None + + # Default max sequence length setting for when no `max_length` is provided + # or no max length config setting is found in the model or tokenizer. 
+ _DEFAULT_MAX_LENGTH: int = 2048 + + def __init__( + self, + pretrained: str, + quantized: Optional[Union[bool, str]] = None, + tokenizer: Optional[str] = None, + subfolder: Optional[str] = None, + revision: Optional[str] = "main", + batch_size: Optional[int] = 1, + max_gen_toks: Optional[int] = 256, + max_length: Optional[int] = None, + add_special_tokens: Optional[bool] = None, + use_accelerate: Optional[bool] = False, + device_map_option: Optional[str] = "auto", + max_memory_per_gpu: Optional[Union[int, str]] = None, + max_cpu_memory: Optional[Union[int, str]] = None, + offload_folder: Optional[str] = "./offload", + dtype: Optional[Union[str, torch.dtype]] = None, + device: Optional[Union[int, str]] = "cuda", + peft: str = None, + load_in_8bit: Optional[bool] = False, + trust_remote_code: Optional[bool] = False, + use_fast: Optional[bool] = True, + gptq_use_triton: Optional[bool] = False, + ): + """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation. + Args: + pretrained (str): + The HuggingFace Hub model ID name or the path to a pre-trained + model to load. This is effectively the `pretrained_model_name_or_path` + argument of `from_pretrained` in the HuggingFace `transformers` API. + quantized (str or True, optional, defaults to None): + File name of a GPTQ quantized model to load. Set to `True` to use the + default name of the quantized model. + add_special_tokens (bool, optional, defaults to True): + Whether to add special tokens to the input sequences. If `None`, the + default value will be set to `True` for seq2seq models (e.g. T5) and + `False` for causal models. + WARNING: Evaluating causal models with `add_special_tokens=True` is + currently __not__ supported. + > Large model loading `accelerate` arguments + use_accelerate (bool, optional, defaults to False): + If True, uses the `accelerate` library to load a large model across + multiple devices. 
+ device_map_option (str, optional, defaults to "auto"): + The device map option to use when loading the model with + `accelerate`. + Options: + "auto", "balanced", "balanced_low_0", "sequential" + See the `accelerate` docs for more details on these options: + https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.device_map + max_memory_per_gpu (Union[int, str], optional, defaults to None): + The maximum memory available for each GPU in bytes as `int` or in + the format f"{significand}{unit_symbol}" where {unit_symbol} is + any of ["GB", "MB", "GIB", "MIB"]. Refer to the `max_memory` arg in + the "Parameters for big model inference" section of the following + docs: + https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.max_memory + max_cpu_memory (Union[int, str], optional, defaults to None): + The maximum available CPU RAM in bytes as `int` or in the format + f"{significand}{unit_symbol}" where {unit_symbol} is any of + ["GB", "MB", "GIB", "MIB"]. Refer to the `max_memory` arg in the + "Parameters for big model inference" section of the following docs: + https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.max_memory + offload_folder (str, optional, defaults to "./offload"): + The folder to offload weights into if `device_map` contains any + "disk" value. + dtype (Union[str, torch.dtype], optional, defaults to None):): + Converts the model weights to `dtype`, if specified. Strings get + converted to `torch.dtype` objects (e.g. `float16` -> `torch.float16`). + Use `dtype="auto"` to derive the type from the model’s weights. + peft (str, optional, defaults to None): + Path of the adapter weights to load from Huggingface. This will usually + include a directory that includes the files `adapter_config.json` and + `adapter_model.bin`. 
Compatible with [PEFT](https://github.com/huggingface/peft) + load_in_8bit (bool, optional, defaults to False): + If True, will convert the loaded model into mixed-8bit quantized model. See: + https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.load_in_8bit + trust_remote_code (bool, optional, defaults to False): + If True, will trust the remote code when loading the model. + use_fast (bool, optional, defaults to True): + If True, will use the fast tokenizer when loading the model. + gptq_use_triton (bool, optional, defaults to False): + Use Triton for GPTQ inference. + """ + super().__init__() + + assert isinstance(pretrained, str) + assert isinstance(device, str) + assert isinstance(batch_size, int) + if ( + add_special_tokens is not None + and self.AUTO_MODEL_CLASS is transformers.AutoModelForCausalLM + ): + # TODO: Support evaluating causal models with special tokens. Currently, + # this is not possible because the `_loglikelihood_tokens()` method for + # causal LMs makes a no-special-tokens assumption given that contexts + # and labels/continuations are tokenized separately without special + # tokens, concatenated, and then processed as inputs. + assert ( + not add_special_tokens + ), "Evaluating causal models with `add_special_tokens=True` is currently not supported." 
+ + self._batch_size = batch_size # TODO: Adaptive batch size + self._max_gen_toks = max_gen_toks + self._max_length = max_length + self._config = self.AUTO_CONFIG_CLASS.from_pretrained( + pretrained, + trust_remote_code=trust_remote_code, + revision=revision + ("/" + subfolder if subfolder is not None else ""), + ) + + self._add_special_tokens = add_special_tokens + self.tokenizer = self._create_auto_tokenizer( + pretrained=pretrained, + revision=revision, + subfolder=subfolder, + tokenizer=tokenizer, + use_fast=use_fast, + ) + self.tokenizer.model_max_length = self.max_length + + model_kwargs = {} + if use_accelerate: + model_kwargs = _get_accelerate_args( + device_map_option, + max_memory_per_gpu, + max_cpu_memory, + offload_folder, + ) + model_kwargs["load_in_8bit"] = load_in_8bit + self.model = self._create_auto_model( + pretrained=pretrained, + quantized=quantized, + trust_remote_code=trust_remote_code, + revision=revision, + subfolder=subfolder, + torch_dtype=_get_dtype(dtype, self._config), + gptq_use_triton=gptq_use_triton, + **model_kwargs, + ) + # note: peft_path can be different than pretrained model path + if peft is not None: + self.model = self._create_auto_model_peft( + model=self.model, + peft=peft, + revision=revision, + subfolder=subfolder, + torch_dtype=_get_dtype(dtype, self._config), + **model_kwargs, + ) + self.model.eval() + torch.set_grad_enabled(False) + + self._device = device + if use_accelerate and "lm_head" in self.model.hf_device_map: + # `accelerate` can place `lm_head` weights on a different device than + # the user specified one so we force `self._device` to be the same as + # `lm_head`'s. + self._device = self.model.hf_device_map["lm_head"] + if not use_accelerate and not load_in_8bit: + try: + self.model.to(self._device) + except: # noqa: E722 + print( + "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore." 
+ ) + + def _create_auto_model( + self, + *, + pretrained: str, + quantized: Optional[Union[bool, str]] = None, + revision: str, + subfolder: str, + device_map: Optional[Union[str, _DeviceMapping]] = None, + max_memory: Optional[dict] = None, + offload_folder: Optional[str] = None, + load_in_8bit: Optional[bool] = False, + trust_remote_code: Optional[bool] = False, + torch_dtype: Optional[Union[str, torch.dtype]] = None, + gptq_use_triton: Optional[bool] = False, + ) -> transformers.AutoModel: + """Returns a pre-trained pytorch model from a pre-trained model configuration.""" + if quantized is None: + model = self.AUTO_MODEL_CLASS.from_pretrained( + pretrained, + revision=revision + ("/" + subfolder if subfolder is not None else ""), + device_map=device_map, + max_memory=max_memory, + offload_folder=offload_folder, + load_in_8bit=load_in_8bit, + trust_remote_code=trust_remote_code, + torch_dtype=torch_dtype, + ) + else: + from auto_gptq import AutoGPTQForCausalLM + + model = AutoGPTQForCausalLM.from_quantized( + pretrained, + model_basename=None if quantized is True else Path(quantized).stem, + device_map=device_map, + max_memory=max_memory, + trust_remote_code=trust_remote_code, + use_safetensors=True + if quantized is True + else quantized.endswith(".safetensors"), + use_triton=gptq_use_triton, + warmup_triton=gptq_use_triton, + ) + return model + + def _create_auto_model_peft( + self, + *, + model: transformers.PreTrainedModel, + peft: str, + revision: str, + subfolder: str, + device_map: Optional[Union[str, _DeviceMapping]] = None, + max_memory: Optional[dict] = None, + offload_folder: Optional[str] = None, + load_in_8bit: Optional[bool] = False, + trust_remote_code: Optional[bool] = False, + torch_dtype: Optional[Union[str, torch.dtype]] = None, + ): + model = self.AUTO_PEFT_CLASS.from_pretrained( + model, + peft, + revision=revision + ("/" + subfolder if subfolder is not None else ""), + device_map=device_map, + max_memory=max_memory, + 
offload_folder=offload_folder, + load_in_8bit=load_in_8bit, + trust_remote_code=trust_remote_code, + torch_dtype=torch_dtype, + ) + return model + + def _create_auto_tokenizer( + self, + *, + pretrained: str, + revision: str, + subfolder: str, + tokenizer: Optional[str] = None, + use_fast: Optional[bool] = True, + ) -> transformers.PreTrainedTokenizer: + """Returns a pre-trained tokenizer from a pre-trained tokenizer configuration.""" + tokenizer = self.AUTO_TOKENIZER_CLASS.from_pretrained( + pretrained if tokenizer is None else tokenizer, + revision=revision + ("/" + subfolder if subfolder is not None else ""), + use_fast=use_fast, + ) + tokenizer.pad_token = tokenizer.eos_token + return tokenizer + + @property + def add_special_tokens(self) -> bool: + """Whether to include special tokens in encoded text. This should be + determined by whether or not the model was trained with special tokens. + TODO: Remove these conditionals once HuggingFace supports a way to + check whether or not an arbitrary model was trained with special tokens. + """ + if self._add_special_tokens is not None: + return self._add_special_tokens + elif self.AUTO_MODEL_CLASS is transformers.AutoModelForCausalLM: + return False + elif self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM: + return True + else: + raise ValueError( + "Could not determine `add_special_tokens` value from the model " + "class. Set to `True` or `False` depending on whether the model " + "was pre-trained with special tokens." + ) + + @property + def eot_token(self) -> str: + return self.tokenizer.eos_token + + @property + def eot_token_id(self) -> int: + return self.tokenizer.eos_token_id + + @property + def max_gen_toks(self) -> int: + return self._max_gen_toks + + @property + def max_length(self) -> int: + """Return the maximum sequence length of the model. + NOTE: Different model configurations have different max sequence length + attribute names. 
+ - n_positions: (CTRLConfig) + - max_position_embeddings: (BartConfig, RoFormerConfig) + - n_ctx: (GPT2Config) + NOTE: For relative position encoded models you should specify the max + sequence length of the model in the constructor via `max_length`. + """ + if self._max_length is not None: + return self._max_length + # Try to get the sequence length from the model config. + seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx") + for attr in seqlen_config_attrs: + if hasattr(self._config, attr): + return getattr(self._config, attr) + if hasattr(self.tokenizer, "model_max_length"): + return self.tokenizer.model_max_length + return self._DEFAULT_MAX_LENGTH + + @property + def batch_size(self) -> int: + # TODO: Add adaptive batch size. + return self._batch_size # * gpus + + @property + def device(self) -> Union[int, str, torch.device]: + return self._device + + def tok_encode(self, string: str) -> TokenSequence: + # TODO: Merge `tok_encode_batch` here. + return self.tokenizer.encode(string, add_special_tokens=self.add_special_tokens) + + def tok_encode_batch(self, strings: List[str]) -> TokenSequence: + return self.tokenizer( + strings, + padding=True, + add_special_tokens=self.add_special_tokens, + return_tensors="pt", + ) + + def tok_decode(self, tokens: torch.LongTensor) -> List[str]: + return self.tokenizer.batch_decode(tokens, skip_special_tokens=True) + + def greedy_until( + self, requests: List[Tuple[str, Union[List[str], str]]] + ) -> List[str]: + def _collate(x): + tokens = self.tok_encode(x[0]) + return len(tokens), x[0] + + results = [] + reorder = utils.Reorderer(requests, _collate) + for chunk in utils.chunks( + tqdm(reorder.get_reordered(), disable=False), self.batch_size + ): + context = [c[0] for c in chunk] + request_args = chunk[0][1] + stop_sequences = ( + request_args if isinstance(request_args, list) else [request_args] + ) # request_args["stop_sequences"] + max_generation_length = ( + self._max_gen_toks + ) # 
request_args["max_generation_length"] + + assert ( + isinstance(max_generation_length, int) or max_generation_length is None + ) + assert isinstance(stop_sequences, list) or stop_sequences is None + + # TODO: Find a better way to handle stop sequences for 0-shot. + if stop_sequences is None: + until = [self.eot_token] + else: + until = stop_sequences + [self.eot_token] + + if max_generation_length is None: + max_tokens = self.max_gen_toks + else: + max_tokens = max_generation_length + + token_context = self.tok_encode_batch(context) + + responses = self._model_generate( + inputs=token_context, + max_tokens=max_tokens, + stop=until, + ) + responses = self.tok_decode(responses.tolist()) + + for response in responses: + # Ensure the generated responses do not contain the stop sequences. + for term in until: + response = response.split(term)[0] + # partial caching + self.cache_hook.add_partial("greedy_until", (context, until), response) + results.append(response) + return reorder.get_original(results) + + +class AutoCausalLM(HuggingFaceAutoLM): + """Causal language modeling. 
+ You can find a set of supported models in the HF documentation: + https://huggingface.co/docs/transformers/main/model_doc/auto#transformers.AutoModelForCausalLM + """ + + AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM + AUTO_PEFT_CLASS = peft.PeftModel + + def _create_auto_tokenizer( + self, + *, + pretrained: str, + revision: str, + subfolder: str, + tokenizer: Optional[str] = None, + use_fast: Optional[bool] = True, + ) -> transformers.PreTrainedTokenizer: + tokenizer = super()._create_auto_tokenizer( + pretrained=pretrained, + revision=revision, + subfolder=subfolder, + tokenizer=tokenizer, + use_fast=use_fast, + ) + tokenizer.padding_side = "left" + return tokenizer + + def _model_call( + self, inputs: TokenSequence, labels: Optional[TokenSequence] = None + ) -> TokenSequence: + return self.model(inputs)["logits"] + + def _model_generate( + self, + inputs: transformers.BatchEncoding, + max_tokens: int, + stop: Optional[List[str]] = None, + ) -> TokenSequence: + # Ensure that the context does not encroach into the `space` + # for the generation. + input_ids = inputs["input_ids"][:, self.max_gen_toks - self.max_length :] + attention_mask = inputs["attention_mask"][ + :, self.max_gen_toks - self.max_length : + ] + input_ids = input_ids.to(self.device) + attention_mask = attention_mask.to(self.device) + + stopping_criteria = stop_sequences_criteria( + self.tokenizer, stop, input_ids.shape[1], input_ids.shape[0] + ) + + generations = self.model.generate( + input_ids=input_ids, + attention_mask=attention_mask, + # GPT style models require the `generate` `max_length` arg to include the + # context length, so we instead set `max_new_tokens` which is the number + # of new tokens to generate, excluding the current number of tokens. 
+ max_new_tokens=max_tokens, + stopping_criteria=stopping_criteria, + do_sample=False, + ) + return utils.select_continuation_from_batch_left_padding( + generations, max_context_size=inputs["input_ids"].size(1) + ) + + +class AutoSeq2SeqLM(HuggingFaceAutoLM): + """Seq2Seq language modeling. + You can find a set of supported models in the following documentation: + https://huggingface.co/docs/transformers/main/model_doc/auto#transformers.AutoModelForSeq2SeqLM + """ + + AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM + AUTO_PEFT_CLASS = peft.PeftModel + + @property + def max_length(self) -> int: + """Return the maximum sequence length of the model. + TODO: Currently only works for relative position encoded Seq2Seq models. + """ + if self._max_length is not None: + return self._max_length + return self._DEFAULT_MAX_LENGTH + + def loglikelihood( + self, requests: List[Tuple[str, str]] + ) -> List[Tuple[float, bool]]: + new_requests = [] + for chunk in utils.chunks(requests, self.batch_size): + context, continuation = zip(*chunk) + + # Fill empty contexts with the EOT token. + context = [ + f"{self.eot_token}" if len(text) == 0 else text for text in context + ] + context_enc = self.tok_encode_batch(context) + for key in context_enc: + context_enc[key] = context_enc[key][:, -self.max_length :] + + # Remove leading whitespace introduced by the default + # `text_target_separator` since the context and continuation + # will not be concatenated as a single (decoder) input. 
+ continuation = [text.lstrip() for text in continuation] + continuation_enc = self.tok_encode_batch(list(continuation)) + for key in continuation_enc: + continuation_enc[key] = continuation_enc[key][:, -self.max_length :] + + new_requests.append( + ((context, continuation), context_enc, continuation_enc) + ) + return self._loglikelihood_tokens(new_requests) + + def loglikelihood_rolling(self, requests: List[Tuple[str, str]]) -> List[float]: + loglikelihoods = [] + for (string,) in tqdm(requests): + rolling_token_windows = list( + map( + utils.make_disjoint_window, + utils.get_rolling_token_windows( + token_list=self.tok_encode(string), + prefix_token=self.eot_token_id, + max_seq_len=self.max_length, + context_len=1, + ), + ) + ) + contexts, conts = utils.split_and_pad_windows( + rolling_token_windows, + pad_token_id=self.eot_token_id, + max_seq_len=self.max_length, + ) + # Manually create BatchEncoding tensors with attention masks as + # expected by `self._model_call` in `self._loglikelihood_tokens`. + contexts_enc = torch.Tensor(contexts).long() + contexts_enc = transformers.tokenization_utils_base.BatchEncoding( + { + "input_ids": contexts_enc, + "attention_mask": (contexts_enc != self.eot_token_id).long(), + } + ) + conts_enc = torch.Tensor(conts).long() + conts_enc = transformers.tokenization_utils_base.BatchEncoding( + { + "input_ids": conts_enc, + "attention_mask": (conts_enc != self.eot_token_id).long(), + } + ) + # TODO: Extract out this call so it only gets called once and also + # somehow figure out partial caching for. 
+ rolling_token_windows_request = [ + ((contexts, conts), contexts_enc, conts_enc) + ] + string_nll = self._loglikelihood_tokens( + rolling_token_windows_request, disable_tqdm=True + ) + string_nll = [x[0] for x in string_nll] # discard is_greedy + string_nll = sum(string_nll) + loglikelihoods.append(string_nll) + return loglikelihoods + + def _loglikelihood_tokens( + self, + requests: List[Tuple[Tuple[str, str], TokenSequence, TokenSequence]], + disable_tqdm: Optional[bool] = False, + ) -> List[Tuple[float, bool]]: + results = [] + for chunk in tqdm( + requests, total=math.ceil(len(requests)), disable=disable_tqdm + ): + cache_keys, inputs_tokens, targets_tokens = chunk + inputs_tokens = inputs_tokens.to(self.device) + targets_tokens = targets_tokens.to(self.device) + outputs = self._model_call(inputs=inputs_tokens, labels=targets_tokens) + log_softmaxes = F.log_softmax(outputs.logits, dim=-1) + + output_iterator = zip( + zip(cache_keys[0], cache_keys[1]), + log_softmaxes, + targets_tokens["input_ids"], + targets_tokens["attention_mask"], + ) + for cache_key, log_softmax, target_tokens, target_mask in output_iterator: + length = target_mask.sum() + log_softmax = log_softmax[:length] + target_tokens = target_tokens[:length] + greedy_tokens = log_softmax.argmax(dim=-1) + max_equal = (greedy_tokens == target_tokens).all() + target_logits = torch.gather( + log_softmax, 1, target_tokens.unsqueeze(-1) + ).squeeze(-1) + answer = (float(target_logits.sum()), bool(max_equal)) + results.append(answer) + if cache_key is not None: + self.cache_hook.add_partial("loglikelihood", cache_key, answer) + return results + + def _model_call( + self, inputs: TokenSequence, labels: Optional[TokenSequence] = None + ) -> TokenSequence: + return self.model(**inputs, labels=labels["input_ids"]) + + def _model_generate( + self, + inputs: transformers.BatchEncoding, + max_tokens: int, + stop: Optional[List[str]] = None, + ) -> TokenSequence: + input_ids = inputs["input_ids"][:, 
-self.max_length :].to(self.device) + attention_mask = inputs["attention_mask"][:, -self.max_length :].to(self.device) + + # Generate one token to calculate the number of start tokens prepended to decoder_input_ids + # (leaving this here in case the below assumption is violated in the future) + # one_tok_gen = self.model.generate( + # input_ids=torch.zeros((1, 1), dtype=torch.int), + # min_length=2, + # max_new_tokens=1, + # ).squeeze() + # initial_decoder_input_length = len(one_tok_gen) - 1 + + # Assume that there will always only be one token in the decoder inputs, assumption holds for existing HF models + stopping_criteria = stop_sequences_criteria( + self.tokenizer, stop, 1, input_ids.shape[0] + ) + + generations = self.model.generate( + input_ids=input_ids, + attention_mask=attention_mask, + max_new_tokens=max_tokens, + stopping_criteria=stopping_criteria, + do_sample=False, + ) + return generations + + +class MultiTokenEOSCriteria(transformers.StoppingCriteria): + """Criteria to stop on the specified multi-token sequence.""" + + def __init__( + self, + sequence: str, + tokenizer: transformers.PreTrainedTokenizer, + initial_decoder_input_length: int, + batch_size: int, + ): + self.initial_decoder_input_length = initial_decoder_input_length + self.done_tracker = [False] * batch_size + self.sequence = sequence + self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False) + self.sequence_id_len = len(self.sequence_ids) + self.tokenizer = tokenizer + + def __call__(self, input_ids, scores, **kwargs) -> bool: + # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence + lookback_ids_batch = input_ids[:, self.initial_decoder_input_length :][ + :, -self.sequence_id_len : + ] + + lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch) + + for i, done in enumerate(self.done_tracker): + if not done: + self.done_tracker[i] = self.sequence in lookback_tokens_batch[i] + return False not in 
self.done_tracker + + +def stop_sequences_criteria( + tokenizer: transformers.PreTrainedTokenizer, + stop_sequences: List[str], + initial_decoder_input_length: int, + batch_size: int, +) -> transformers.StoppingCriteriaList: + return transformers.StoppingCriteriaList( + [ + *[ + MultiTokenEOSCriteria( + sequence, tokenizer, initial_decoder_input_length, batch_size + ) + for sequence in stop_sequences + ], + ] + ) diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/models/textsynth.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/models/textsynth.py new file mode 100644 index 0000000000000000000000000000000000000000..bc80cc9969583ad15928de4074e69a4ba4ea9539 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/models/textsynth.py @@ -0,0 +1,155 @@ +""" TextSynth API +Implementation provided by Fabrice Bellard: + https://github.com/EleutherAI/lm-evaluation-harness/issues/295 + +In order to use the API, you must have a valid TextSynth account and +enough credits. + +Example usage: + + python main.py --model textsynth --model_args engine=gptj_6B --no_cache --tasks piqa + +Homepage: https://textsynth.com/index.html +""" +import logging +import os +import requests as _requests +import time +from tqdm import tqdm +from lm_eval.base import BaseLM + + +logger = logging.getLogger(__name__) + + +def textsynth_completion(**kwargs): + """Query TextSynth API for completion. + Retry with back-off until they respond. + """ + backoff_time = 3 + while True: + try: + return _requests.post(**kwargs) + except _requests.exceptions.RequestException: + import traceback + + traceback.print_exc() + time.sleep(backoff_time) + backoff_time *= 1.5 + + +class TextSynthLM(BaseLM): + def __init__(self, engine, truncate=False): + """ + :param engine: str + TextSynth API engine (e.g. 
`gptj_6B`) + :param truncate: bool + Truncate input if too long (if False and input is too long, throw error) + """ + super().__init__() + + self.engine = engine + self.truncate = truncate + self.api_url = "https://api.textsynth.com" + # Read from environment variable TEXTSYNTH_API_SECRET_KEY + self.api_key = os.environ["TEXTSYNTH_API_SECRET_KEY"] + + @property + def eot_token_id(self): + # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until + raise NotImplementedError() + + @property + def max_length(self): + # NOTE: Turn on truncation to avoid errors on long inputs. + return 2048 + + @property + def max_gen_toks(self): + return 256 + + @property + def batch_size(self): + # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until + raise NotImplementedError() + + @property + def device(self): + # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until + raise NotImplementedError() + + def tok_encode(self, string: str): + # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until + raise NotImplementedError() + + def tok_decode(self, tokens): + # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until + raise NotImplementedError() + + def loglikelihood(self, requests): + res = [] + for context, continuation in tqdm(requests): + response = textsynth_completion( + url=self.api_url + "/v1/engines/" + self.engine + "/logprob", + headers={"Authorization": "Bearer " + self.api_key}, + json={"context": context, "continuation": continuation}, + ) + resp = response.json() + if "logprob" in resp: + logprob = resp["logprob"] + is_greedy = resp["is_greedy"] + res.append((logprob, is_greedy)) + else: + logger.error( + f"The following response does not contain `logprobs`. 
Got:\n{resp}" + ) + assert False + return res + + def loglikelihood_rolling(self, requests): + # TODO: The TextSynth API does not support tokenized inputs so we cannot + # manually partition long contexts into smaller rolling windows as + # done for other models derived from `BaseLM`. Override this method + # with a windowing scheme that works for direct string inputs. + raise NotImplementedError( + "`loglikelihood_rolling` is currently not supported due to lack of " + "input tokenization support from TextSynth." + ) + + def greedy_until(self, requests): + if not requests: + return [] + + res = [] + for request in tqdm(requests): + inp = request[0] + until = request[1] + response = textsynth_completion( + url=self.api_url + "/v1/engines/" + self.engine + "/completions", + headers={"Authorization": "Bearer " + self.api_key}, + json={ + "prompt": inp, + "max_tokens": self.max_gen_toks, + "top_k": 1, + "stop": until, + }, + ) + resp = response.json() + if "text" in resp: + s = resp["text"] + res.append(s) + else: + logger.error( + f"The following response does not contain generated `text`. 
" + "Got:\n{resp}" + ) + assert False + return res + + def _model_call(self, inps): + # Isn't used because we override _loglikelihood_tokens + raise NotImplementedError() + + def _model_generate(self, context, max_length, eos_token_id): + # Isn't used because we override greedy_until + raise NotImplementedError() diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/__init__.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0c9b16aabee5e2c366aea0d978fbb342a9ee4dc Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/anli.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/anli.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30001b12a59b16ff95c268a37aef518d228aa8a2 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/anli.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/arc.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/arc.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9457bf8b297e5410475ff758f8ec30a0f898e419 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/arc.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/arithmetic.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/arithmetic.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b3e68270a96f21b3954109382937d802deedcd9 Binary files /dev/null and 
b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/arithmetic.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/asdiv.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/asdiv.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d06c230d9b118bb8c9ef7dac8af023f547e4fcd7 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/asdiv.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/blimp.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/blimp.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bae86c01f8da82f90bd4ee847c0f37b0bcad305a Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/blimp.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/cbt.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/cbt.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..edc380ab18a8a97d1320a37db8a8803f86a38539 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/cbt.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/coqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/coqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a77764377214f72b90279fea25908e444ff6102f Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/coqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/crowspairs.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/crowspairs.cpython-310.pyc new 
file mode 100644 index 0000000000000000000000000000000000000000..65ad1a119701f3952d853fe5d4c77c6ea2005a65 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/crowspairs.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/drop.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/drop.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d84daf8c2faef6c300209302755ebc4d893290f0 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/drop.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/gsm8k.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/gsm8k.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a560e5b90a3a6ec5b038aa990f960da1b5544780 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/gsm8k.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/headqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/headqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6edfd0eab0f2a33d61c2df220f398e3a0211e5da Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/headqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hellaswag.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hellaswag.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b000ee58e62d3ded646b13259a7e05d7381c51c Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hellaswag.cpython-310.pyc differ diff --git 
a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hendrycks_math.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hendrycks_math.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a6aa89cb0d5a32fa197feb2acd16dd3a8e24a46a Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hendrycks_math.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hendrycks_test.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hendrycks_test.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c27b87d87b348eba6cda243ac23eed9a87efbc1 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/hendrycks_test.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1d7136fb0b8b8a918fe578ed1fe0513e87338f5 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada_cloze.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada_cloze.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02215e3807397be1780b82a2c1fff6e7b49b07f8 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada_cloze.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada_multilingual.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada_multilingual.cpython-310.pyc new 
file mode 100644 index 0000000000000000000000000000000000000000..d97a2ceb3a230786e552b61aa72b4995afee9a4c Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/lambada_multilingual.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/logiqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/logiqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f122e1005a95490469ceb9f4076092bfc5f416ab Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/logiqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/mathqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/mathqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..157ca9c1caa5a42a2e1f8adb77497649f429a002 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/mathqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/mc_taco.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/mc_taco.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb308a754c3663071e113715991b50f901ce9c89 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/mc_taco.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/mutual.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/mutual.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4c232e9606134332a7da2ff3749c44c374a5222f Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/mutual.cpython-310.pyc differ diff --git 
a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/naturalqs.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/naturalqs.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5834baad11e4e6e897bb5a92f5e8b130176aeed7 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/naturalqs.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/openbookqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/openbookqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a63fe17da168733592e9f80746732403a6b1519 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/openbookqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/pile.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/pile.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b01e0ad08b0a0d97e1ba1d79ae99a4c4c2c4acc Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/pile.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/piqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/piqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d170cfdd4457e36f99460189428cab4c89c0f4c Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/piqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/prost.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/prost.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..a45a715d31a32794150ad95ac47a08f6b4ababb2 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/prost.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/pubmedqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/pubmedqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de7024a36676c3e3e51a5cad5c5200b0784ed3e4 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/pubmedqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/qa4mre.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/qa4mre.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02256d0acaf080d97d00e4aea8108434ca5469cb Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/qa4mre.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/qasper.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/qasper.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d24ff3c2e3944ede5340997b207c5a912ee7ccdc Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/qasper.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/race.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/race.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b9b9b2b0965c51afcd0ccb17a89789cb056fae6 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/race.cpython-310.pyc differ diff --git 
a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/sat.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/sat.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80db4975f7fdbee55be6b211ed938ca64aa7b7f0 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/sat.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/sciq.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/sciq.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..85fd1fbc813fac61aad6a9266b21c71db5cbe952 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/sciq.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/squad.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/squad.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..89d0a76977c581396622c3e36505be0d8072d282 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/squad.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/storycloze.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/storycloze.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a018ac9bb301c3579adf1cea58256bd31d86b2f Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/storycloze.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/swag.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/swag.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ee97fe6b70c5c6a84e0e6d7bbc6f574cc1092df 
Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/swag.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/toxigen.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/toxigen.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d3326820193e08518dc3294ebbad60f5e5d51ea Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/toxigen.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/translation.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/translation.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7087dcea3370bd5aba29f5843f36e42c8993f3c Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/translation.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/triviaqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/triviaqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06c7c875e7942bc6da39b546cb3de5c8c0ca3673 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/triviaqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/truthfulqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/truthfulqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3377a2eff2a030e633b5ded11d1b658d0f41ac78 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/truthfulqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/webqs.cpython-310.pyc 
b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/webqs.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1fe3970ef197900219c330f5dc35dbf7a2709c37 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/webqs.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/wikitext.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/wikitext.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e42ac9313a6bf84323cec6e4559f5e063de3acc Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/wikitext.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/wsc273.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/wsc273.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..579e0e00035180b18c49ea5e85103bb405da5ec9 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/tasks/__pycache__/wsc273.cpython-310.pyc differ