diff --git a/scripts/yans/eval/lm-evaluation-harness/.github/workflows/pull_request.yml b/scripts/yans/eval/lm-evaluation-harness/.github/workflows/pull_request.yml new file mode 100644 index 0000000000000000000000000000000000000000..fa3838a5b90cdfd2a18d92c933d4fa28379024b1 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/.github/workflows/pull_request.yml @@ -0,0 +1,13 @@ +name: Pull Request + +on: [pull_request] + +jobs: + pre-commit: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + - uses: pre-commit/action@v2.0.3 diff --git a/scripts/yans/eval/lm-evaluation-harness/.github/workflows/python-app.yml b/scripts/yans/eval/lm-evaluation-harness/.github/workflows/python-app.yml new file mode 100644 index 0000000000000000000000000000000000000000..fb8e54d948ea9602f785eca64acce1079ce30289 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/.github/workflows/python-app.yml @@ -0,0 +1,50 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Build + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Cache + uses: actions/cache@v2.1.3 + with: + # A list of files, directories, and wildcard patterns to cache and restore + path: | + ~/.cache + # An explicit key for restoring and saving the cache + key: evaldata-cache-4 + - name: Set up Python 3.9 + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest pytest-cov + pip install -e .[dev,multilingual] + # Install optional git dependencies + pip install 
bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest -vv --cov=lm_eval/ tests/ + - name: Upload to codecov + run: | + bash <(curl -s https://codecov.io/bash) -t $CODECOV_TOKEN diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/__pycache__/__init__.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c910c04276e4db204f63f0155ed0d6143bd6c5e Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/__pycache__/__init__.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a2705b5b7d4f8099dfd18cb5ba9844bde0cf7fd3 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/__pycache__/asdiv.cpython-310.pyc 
b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/__pycache__/asdiv.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6c256b3b2d81920b7dd8f3de583863343f68def3 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/__pycache__/asdiv.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/asdiv.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/asdiv.py new file mode 100644 index 0000000000000000000000000000000000000000..927de50c2b4db05db38e22487cdf8e1c87f6f1ee --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/asdiv.py @@ -0,0 +1,111 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ASDIV dataset.""" + + +import os +import xml.etree.ElementTree as ET + +import datasets + + +_CITATION = """\ +@misc{miao2021diverse, + title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers}, + author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su}, + year={2021}, + eprint={2106.15772}, + archivePrefix={arXiv}, + primaryClass={cs.AI} +} +""" + +_DESCRIPTION = """\ +ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language +patterns and problem types) English math word problem (MWP) corpus for evaluating +the capability of various MWP solvers. 
Existing MWP corpora for studying AI progress +remain limited either in language usage patterns or in problem types. We thus present +a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem +types taught in elementary school. Each MWP is annotated with its problem type and grade +level (for indicating the level of difficulty). +""" + +_HOMEPAGE = "https://github.com/chaochun/nlu-asdiv-dataset" + +# TODO: Add the licence for the dataset here if you can find it +_LICENSE = "" + +_URLS = "https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip" + + +class ASDiv(datasets.GeneratorBasedBuilder): + """ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers""" + + VERSION = datasets.Version("0.0.1") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig( + name="asdiv", + version=VERSION, + description="A diverse corpus for evaluating and developing english math word problem solvers", + ) + ] + + def _info(self): + features = datasets.Features( + { + "body": datasets.Value("string"), + "question": datasets.Value("string"), + "solution_type": datasets.Value("string"), + "answer": datasets.Value("string"), + "formula": datasets.Value("string"), + } + ) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + urls = _URLS + data_dir = dl_manager.download_and_extract(urls) + base_filepath = "nlu-asdiv-dataset-55790e5270bb91ccfa5053194b25732534696b50" + return [ + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join( + data_dir, base_filepath, "dataset", "ASDiv.xml" + ), + "split": datasets.Split.VALIDATION, + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + def _generate_examples(self, 
filepath, split): + tree = ET.parse(filepath) + root = tree.getroot() + for key, problem in enumerate(root.iter("Problem")): + yield key, { + "body": problem.find("Body").text, + "question": problem.find("Question").text, + "solution_type": problem.find("Solution-Type").text, + "answer": problem.find("Answer").text, + "formula": problem.find("Formula").text, + } diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/dataset_infos.json b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/dataset_infos.json new file mode 100644 index 0000000000000000000000000000000000000000..cfeea0d389b1c0e0150b6b4d16a71ac9ce0dcfbb --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/dataset_infos.json @@ -0,0 +1 @@ +{"asdiv": {"description": "ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language\npatterns and problem types) English math word problem (MWP) corpus for evaluating\nthe capability of various MWP solvers. Existing MWP corpora for studying AI progress\nremain limited either in language usage patterns or in problem types. We thus present\na new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem\ntypes taught in elementary school. 
Each MWP is annotated with its problem type and grade\nlevel (for indicating the level of difficulty).\n", "citation": "@misc{miao2021diverse,\n title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},\n author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},\n year={2021},\n eprint={2106.15772},\n archivePrefix={arXiv},\n primaryClass={cs.AI}\n}\n", "homepage": "https://github.com/chaochun/nlu-asdiv-dataset", "license": "", "features": {"body": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "solution_type": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "formula": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "as_div", "config_name": "asdiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 501489, "num_examples": 2305, "dataset_name": "as_div"}}, "download_checksums": {"https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip": {"num_bytes": 440966, "checksum": "8f1fe4f6d5f170ec1e24ab78c244153c14c568b1bb2b1dad0324e71f37939a2d"}}, "download_size": 440966, "post_processing_size": null, "dataset_size": 501489, "size_in_bytes": 942455}} diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/__pycache__/__init__.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..9a032b04fedd1b019d2ed3c9913dc9d75b25e3c9 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/__pycache__/coqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/__pycache__/coqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e363d48dbfa0040cf4f51a57297911f385df81fa Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/__pycache__/coqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/coqa.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/coqa.py new file mode 100644 index 0000000000000000000000000000000000000000..0f0983e17263225739416805602ae6b2725a1e3d --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/coqa.py @@ -0,0 +1,245 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""CoQA dataset. 
+ +This `CoQA` adds the "additional_answers" feature that's missing in the original +datasets version: +https://github.com/huggingface/datasets/blob/master/datasets/coqa/coqa.py +""" + + +import json + +import datasets + + +_CITATION = """\ +@misc{reddy2018coqa, + title={CoQA: A Conversational Question Answering Challenge}, + author={Siva Reddy and Danqi Chen and Christopher D. Manning}, + year={2018}, + eprint={1808.07042}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +""" + +_DESCRIPTION = """\ +CoQA is a large-scale dataset for building Conversational Question Answering +systems. The goal of the CoQA challenge is to measure the ability of machines to +understand a text passage and answer a series of interconnected questions that +appear in a conversation. +""" + +_HOMEPAGE = "https://stanfordnlp.github.io/coqa/" + +# TODO: Add the licence for the dataset here if you can find it +_LICENSE = "" + +_URLS = { + "train": "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json", + "validation": "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json", +} + +# `additional_answers` are not available in the train set so we fill them with +# empty dicts of the same form. +_EMPTY_ADDITIONAL_ANSWER = { + "0": [ + { + "span_start": -1, + "span_end": -1, + "span_text": "", + "input_text": "", + "turn_id": -1, + } + ], + "1": [ + { + "span_start": -1, + "span_end": -1, + "span_text": "", + "input_text": "", + "turn_id": -1, + } + ], + "2": [ + { + "span_start": -1, + "span_end": -1, + "span_text": "", + "input_text": "", + "turn_id": -1, + } + ], +} + + +class Coqa(datasets.GeneratorBasedBuilder): + """CoQA is a large-scale dataset for building Conversational Question Answering systems.""" + + VERSION = datasets.Version("0.0.1") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig( + name="coqa", version=VERSION, description="The CoQA dataset." 
+ ), + ] + + def _info(self): + features = datasets.Features( + { + "id": datasets.Value("string"), + "source": datasets.Value("string"), + "story": datasets.Value("string"), + "questions": datasets.features.Sequence( + { + "input_text": datasets.Value("string"), + "turn_id": datasets.Value("int32"), + } + ), + "answers": datasets.features.Sequence( + { + "span_start": datasets.Value("int32"), + "span_end": datasets.Value("int32"), + "span_text": datasets.Value("string"), + "input_text": datasets.Value("string"), + "turn_id": datasets.Value("int32"), + } + ), + "additional_answers": { + "0": datasets.features.Sequence( + { + "span_start": datasets.Value("int32"), + "span_end": datasets.Value("int32"), + "span_text": datasets.Value("string"), + "input_text": datasets.Value("string"), + "turn_id": datasets.Value("int32"), + } + ), + "1": datasets.features.Sequence( + { + "span_start": datasets.Value("int32"), + "span_end": datasets.Value("int32"), + "span_text": datasets.Value("string"), + "input_text": datasets.Value("string"), + "turn_id": datasets.Value("int32"), + } + ), + "2": datasets.features.Sequence( + { + "span_start": datasets.Value("int32"), + "span_end": datasets.Value("int32"), + "span_text": datasets.Value("string"), + "input_text": datasets.Value("string"), + "turn_id": datasets.Value("int32"), + } + ), + }, + } + ) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + urls = {"train": _URLS["train"], "validation": _URLS["validation"]} + data_dirs = dl_manager.download_and_extract(urls) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": data_dirs["train"], + "split": datasets.Split.TRAIN, + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + # These kwargs will be passed to _generate_examples + 
gen_kwargs={ + "filepath": data_dirs["validation"], + "split": datasets.Split.VALIDATION, + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + def _generate_examples(self, filepath, split): + with open(filepath, encoding="utf-8") as f: + data = json.load(f) + for row in data["data"]: + id = row["id"] + source = row["source"] + story = row["story"] + questions = [ + {"input_text": q["input_text"], "turn_id": q["turn_id"]} + for q in row["questions"] + ] + answers = [ + { + "span_start": a["span_start"], + "span_end": a["span_end"], + "span_text": a["span_text"], + "input_text": a["input_text"], + "turn_id": a["turn_id"], + } + for a in row["answers"] + ] + if split == datasets.Split.TRAIN: + additional_answers = _EMPTY_ADDITIONAL_ANSWER + else: + additional_answers = { + "0": [ + { + "span_start": a0["span_start"], + "span_end": a0["span_end"], + "span_text": a0["span_text"], + "input_text": a0["input_text"], + "turn_id": a0["turn_id"], + } + for a0 in row["additional_answers"]["0"] + ], + "1": [ + { + "span_start": a1["span_start"], + "span_end": a1["span_end"], + "span_text": a1["span_text"], + "input_text": a1["input_text"], + "turn_id": a1["turn_id"], + } + for a1 in row["additional_answers"]["1"] + ], + "2": [ + { + "span_start": a2["span_start"], + "span_end": a2["span_end"], + "span_text": a2["span_text"], + "input_text": a2["input_text"], + "turn_id": a2["turn_id"], + } + for a2 in row["additional_answers"]["2"] + ], + } + yield row["id"], { + "id": id, + "story": story, + "source": source, + "questions": questions, + "answers": answers, + "additional_answers": additional_answers, + } diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/dataset_infos.json b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/dataset_infos.json new file mode 100644 index 0000000000000000000000000000000000000000..ff8ab4a73199e65d8ce4f4e25e0a31f2e492cf22 --- /dev/null +++ 
b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/dataset_infos.json @@ -0,0 +1 @@ +{"coqa": {"description": "CoQA is a large-scale dataset for building Conversational Question Answering\nsystems. The goal of the CoQA challenge is to measure the ability of machines to\nunderstand a text passage and answer a series of interconnected questions that\nappear in a conversation.\n", "citation": "@misc{reddy2018coqa,\n title={CoQA: A Conversational Question Answering Challenge},\n author={Siva Reddy and Danqi Chen and Christopher D. Manning},\n year={2018},\n eprint={1808.07042},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://stanfordnlp.github.io/coqa/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "story": {"dtype": "string", "id": null, "_type": "Value"}, "questions": {"feature": {"input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "answers": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "additional_answers": {"0": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "1": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": 
{"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "2": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "coqa", "config_name": "coqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 26250528, "num_examples": 7199, "dataset_name": "coqa"}, "validation": {"name": "validation", "num_bytes": 3765933, "num_examples": 500, "dataset_name": "coqa"}}, "download_checksums": {"https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json": {"num_bytes": 49001836, "checksum": "b0fdb2bc1bd38dd3ca2ce5fa2ac3e02c6288ac914f241ac409a655ffb6619fa6"}, "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json": {"num_bytes": 9090845, "checksum": "dfa367a9733ce53222918d0231d9b3bedc2b8ee831a2845f62dfc70701f2540a"}}, "download_size": 58092681, "post_processing_size": null, "dataset_size": 30016461, "size_in_bytes": 88109142}} diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/__pycache__/__init__.cpython-310.pyc 
b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..33eca11ce9e8aadaf6a62c10a617152045f1e8f7 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/__pycache__/headqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/__pycache__/headqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40b52f5cb5e9eafff5decb0e98fd14e0a81999ed Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/__pycache__/headqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/dataset_infos.json b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/dataset_infos.json new file mode 100644 index 0000000000000000000000000000000000000000..47d6707dbd120aa42cdb37e4822594c5350d91fd --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/dataset_infos.json @@ -0,0 +1 @@ +{"es": {"description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. 
They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n", "citation": "@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. 
We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n", "homepage": "https://aghie.github.io/head-qa/", "license": "MIT License", "features": {"name": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"dtype": "string", "id": null, "_type": "Value"}, "qid": {"dtype": "int32", "id": null, "_type": "Value"}, "qtext": {"dtype": "string", "id": null, "_type": "Value"}, "ra": {"dtype": "int32", "id": null, "_type": "Value"}, "answers": [{"aid": {"dtype": "int32", "id": null, "_type": "Value"}, "atext": {"dtype": "string", "id": null, "_type": "Value"}}]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "head_qa", "config_name": "es", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1196021, "num_examples": 2657, "dataset_name": "head_qa"}, "test": {"name": "test", "num_bytes": 1169819, "num_examples": 2742, "dataset_name": "head_qa"}, "validation": {"name": "validation", "num_bytes": 556924, "num_examples": 1366, "dataset_name": "head_qa"}}, "download_checksums": {"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t": {"num_bytes": 79365502, "checksum": "6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}}, "download_size": 79365502, "post_processing_size": null, "dataset_size": 2922764, "size_in_bytes": 82288266}, "en": {"description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. 
They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n", "citation": "@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. 
We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n", "homepage": "https://aghie.github.io/head-qa/", "license": "MIT License", "features": {"name": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"dtype": "string", "id": null, "_type": "Value"}, "qid": {"dtype": "int32", "id": null, "_type": "Value"}, "qtext": {"dtype": "string", "id": null, "_type": "Value"}, "ra": {"dtype": "int32", "id": null, "_type": "Value"}, "answers": [{"aid": {"dtype": "int32", "id": null, "_type": "Value"}, "atext": {"dtype": "string", "id": null, "_type": "Value"}}]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "head_qa", "config_name": "en", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1123151, "num_examples": 2657, "dataset_name": "head_qa"}, "test": {"name": "test", "num_bytes": 1097349, "num_examples": 2742, "dataset_name": "head_qa"}, "validation": {"name": "validation", "num_bytes": 523462, "num_examples": 1366, "dataset_name": "head_qa"}}, "download_checksums": {"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t": {"num_bytes": 79365502, "checksum": "6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}}, "download_size": 79365502, "post_processing_size": null, "dataset_size": 2743962, "size_in_bytes": 82109464}} diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/headqa.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/headqa.py new file mode 100644 index 0000000000000000000000000000000000000000..73be342b98253edfe2e119d5ee2d06d6d7d7af4e --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/headqa.py @@ -0,0 
+1,162 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# NOTE: This is an exact copy of +# https://github.com/huggingface/datasets/blob/3804442bb7cfcb9d52044d92688115cfdc69c2da/datasets/head_qa/head_qa.py +# with the exception of the `image` feature. This is to avoid adding `Pillow` +# as a dependency. +"""HEAD-QA: A Healthcare Dataset for Complex Reasoning.""" + + +import json +import os + +import datasets + + +_CITATION = """\ +@inproceedings{vilares-gomez-rodriguez-2019-head, + title = "{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning", + author = "Vilares, David and + G{\'o}mez-Rodr{\'i}guez, Carlos", + booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics", + month = jul, + year = "2019", + address = "Florence, Italy", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/P19-1092", + doi = "10.18653/v1/P19-1092", + pages = "960--966", + abstract = "We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. 
We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.", +} +""" + +_DESCRIPTION = """\ +HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the +Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio +de Sanidad, Consumo y Bienestar Social. +The dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology. +""" + +_HOMEPAGE = "https://aghie.github.io/head-qa/" + +_LICENSE = "MIT License" + +_URL = "https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t" + +_DIRS = {"es": "HEAD", "en": "HEAD_EN"} + + +class HeadQA(datasets.GeneratorBasedBuilder): + """HEAD-QA: A Healthcare Dataset for Complex Reasoning""" + + VERSION = datasets.Version("1.1.0") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig( + name="es", version=VERSION, description="Spanish HEAD dataset" + ), + datasets.BuilderConfig( + name="en", version=VERSION, description="English HEAD dataset" + ), + ] + + DEFAULT_CONFIG_NAME = "es" + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "name": datasets.Value("string"), + "year": datasets.Value("string"), + "category": datasets.Value("string"), + "qid": datasets.Value("int32"), + "qtext": datasets.Value("string"), + "ra": datasets.Value("int32"), + "answers": [ + { + "aid": datasets.Value("int32"), + "atext": datasets.Value("string"), + } + ], + } + ), + supervised_keys=None, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + data_dir = dl_manager.download_and_extract(_URL) + + dir = _DIRS[self.config.name] + data_lang_dir = os.path.join(data_dir, dir) + + return [ + 
datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "data_dir": data_dir, + "filepath": os.path.join(data_lang_dir, f"train_{dir}.json"), + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "data_dir": data_dir, + "filepath": os.path.join(data_lang_dir, f"test_{dir}.json"), + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "data_dir": data_dir, + "filepath": os.path.join(data_lang_dir, f"dev_{dir}.json"), + }, + ), + ] + + def _generate_examples(self, data_dir, filepath): + """Yields examples.""" + with open(filepath, encoding="utf-8") as f: + head_qa = json.load(f) + for exam_id, exam in enumerate(head_qa["exams"]): + content = head_qa["exams"][exam] + name = content["name"].strip() + year = content["year"].strip() + category = content["category"].strip() + for question in content["data"]: + qid = int(question["qid"].strip()) + qtext = question["qtext"].strip() + ra = int(question["ra"].strip()) + + aids = [answer["aid"] for answer in question["answers"]] + atexts = [answer["atext"].strip() for answer in question["answers"]] + answers = [ + {"aid": aid, "atext": atext} for aid, atext in zip(aids, atexts) + ] + + id_ = f"{exam_id}_{qid}" + yield id_, { + "name": name, + "year": year, + "category": category, + "qid": qid, + "qtext": qtext, + "ra": ra, + "answers": answers, + } diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/__pycache__/__init__.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..7579fb56d3a84bd8e1142ad45d49422b1004c7bf Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/__pycache__/hendrycks_ethics.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/__pycache__/hendrycks_ethics.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f4fe4b2ac0ce55faca635c6eab2da972fbbed94 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/__pycache__/hendrycks_ethics.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/dataset_infos.json b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/dataset_infos.json new file mode 100644 index 0000000000000000000000000000000000000000..54aecc3bed829a951c32ec612db33252097c6ba6 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/dataset_infos.json @@ -0,0 +1 @@ +{"commonsense": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"label": {"dtype": "int32", "id": null, "_type": "Value"}, "input": {"dtype": "string", "id": null, "_type": "Value"}, "is_short": {"dtype": "bool", "id": null, "_type": "Value"}, "edited": {"dtype": "bool", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "commonsense", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 14435215, "num_examples": 13910, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 3150094, "num_examples": 3885, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 17585309, "size_in_bytes": 53170333}, "deontology": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}, "excuse": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "deontology", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1931475, "num_examples": 18164, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 384602, "num_examples": 3596, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2316077, "size_in_bytes": 37901101}, "justice": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Justice subset contains examples focusing on how a character treats another person", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "justice", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2516501, "num_examples": 21791, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 309427, "num_examples": 2704, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2825928, "size_in_bytes": 38410952}, "utilitarianism": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"activity": {"dtype": "string", "id": null, "_type": "Value"}, "baseline": {"dtype": "string", "id": null, "_type": "Value"}, "rating": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "utilitarianism", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2241770, "num_examples": 13738, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 749768, "num_examples": 4808, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2991538, "size_in_bytes": 38576562}, "virtue": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. 
This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}, "trait": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "virtue", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2640328, "num_examples": 28245, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 473473, "num_examples": 4975, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 3113801, "size_in_bytes": 38698825}} diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/hendrycks_ethics.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/hendrycks_ethics.py new file mode 100644 index 
0000000000000000000000000000000000000000..520f912e27e669db6fd71d7b88e68836645b83f4 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/hendrycks_ethics.py @@ -0,0 +1,229 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ETHICS dataset.""" +# TODO: Add the `hard` dataset splits. + + +import csv +import os + +import datasets + + +_CITATION = """\ +@article{hendrycks2021ethics + title={Aligning AI With Shared Human Values}, + author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt}, + journal={Proceedings of the International Conference on Learning Representations (ICLR)}, + year={2021} +} +""" + +_DESCRIPTION = """\ +The ETHICS dataset is a benchmark that spans concepts in justice, well-being, +duties, virtues, and commonsense morality. Models predict widespread moral +judgments about diverse text scenarios. This requires connecting physical and +social world knowledge to value judgements, a capability that may enable us +to steer chatbot outputs or eventually regularize open-ended reinforcement +learning agents. 
+""" + +_HOMEPAGE = "https://github.com/hendrycks/ethics" + +# TODO: Add the licence for the dataset here if you can find it +_LICENSE = "" + +_URLS = "https://people.eecs.berkeley.edu/~hendrycks/ethics.tar" + + +class EthicsConfig(datasets.BuilderConfig): + """BuilderConfig for Hendrycks ETHICS.""" + + def __init__(self, prefix, features, **kwargs): + """BuilderConfig for Hendrycks ETHICS. + + Args: + prefix: *string*, prefix to add to the dataset name for path location. + features: *list[string]*, list of the features that will appear in the + feature dict. + """ + # Version history: + super().__init__(version=datasets.Version("0.0.1"), **kwargs) + self.prefix = prefix + self.features = features + + +class HendrycksEthics(datasets.GeneratorBasedBuilder): + """The ETHICS dataset is a benchmark that spans concepts in justice, well-being, duties, virtues, and commonsense morality.""" + + BUILDER_CONFIGS = [ + EthicsConfig( + name="commonsense", + prefix="cm", + features=datasets.Features( + { + "label": datasets.Value("int32"), + "input": datasets.Value("string"), + "is_short": datasets.Value("bool"), + "edited": datasets.Value("bool"), + } + ), + description="The Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.", + ), + EthicsConfig( + name="deontology", + prefix="deontology", + features=datasets.Features( + { + "group_id": datasets.Value("int32"), + "label": datasets.Value("int32"), + "scenario": datasets.Value("string"), + "excuse": datasets.Value("string"), + } + ), + description="The Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints", + ), + EthicsConfig( + name="justice", + prefix="justice", + features=datasets.Features( + { + "group_id": datasets.Value("int32"), + "label": datasets.Value("int32"), + "scenario": datasets.Value("string"), + } + ), + description="The Justice subset contains examples 
focusing on how a character treats another person", + ), + EthicsConfig( + name="utilitarianism", + prefix="util", + features=datasets.Features( + { + "activity": datasets.Value("string"), + "baseline": datasets.Value("string"), + "rating": datasets.Value("string"), # Empty rating. + } + ), + description="The Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario", + ), + EthicsConfig( + name="virtue", + prefix="virtue", + features=datasets.Features( + { + "group_id": datasets.Value("int32"), + "label": datasets.Value("int32"), + "scenario": datasets.Value("string"), + "trait": datasets.Value("string"), + } + ), + description="The Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified", + ), + ] + + def _info(self): + return datasets.DatasetInfo( + description=f"{_DESCRIPTION}\n{self.config.description}", + features=self.config.features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + urls = _URLS + data_dir = dl_manager.download_and_extract(urls) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join( + data_dir, + "ethics", + self.config.name, + f"{self.config.prefix}_train.csv", + ), + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join( + data_dir, + "ethics", + self.config.name, + f"{self.config.prefix}_test.csv", + ), + "split": "test", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + def _generate_examples(self, filepath, split): + with open(filepath, newline="") as f: + if self.config.name == "utilitarianism": + contents = csv.DictReader(f, fieldnames=["activity", "baseline"]) + else: + contents = 
csv.DictReader(f) + # For subsets with grouped scenarios, tag them with an id. + group_id = 0 + for key, row in enumerate(contents): + if self.config.name == "deontology": + # Scenarios come in groups of 4. + if key % 4 == 0 and key != 0: + group_id += 1 + yield key, { + "group_id": group_id, + "label": row["label"], + "scenario": row["scenario"], + "excuse": row["excuse"], + } + elif self.config.name == "justice": + # Scenarios come in groups of 4. + if key % 4 == 0 and key != 0: + group_id += 1 + yield key, { + "group_id": group_id, + "label": row["label"], + "scenario": row["scenario"], + } + elif self.config.name == "commonsense": + yield key, { + "label": row["label"], + "input": row["input"], + "is_short": row["is_short"], + "edited": row["edited"], + } + elif self.config.name == "virtue": + # Scenarios come in groups of 5. + if key % 5 == 0 and key != 0: + group_id += 1 + scenario, trait = row["scenario"].split(" [SEP] ") + yield key, { + "group_id": group_id, + "label": row["label"], + "scenario": scenario, + "trait": trait, + } + elif self.config.name == "utilitarianism": + yield key, { + "activity": row["activity"], + "baseline": row["baseline"], + "rating": "", + } diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/__pycache__/__init__.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29f58991d892e91ba5ab4743a5cbe322eae92ce0 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/lambada_ja/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/lambada_ja/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/lambada_ja/__pycache__/__init__.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/lambada_ja/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e949763e0877c0ee271e707dc4ae05159131e6d7 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/lambada_ja/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/lambada_ja/__pycache__/lambada_ja.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/lambada_ja/__pycache__/lambada_ja.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0f1b9d0319a72dfcfb217612560ca20845021d5 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/lambada_ja/__pycache__/lambada_ja.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/lambada_ja/lambada_ja.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/lambada_ja/lambada_ja.py new file mode 100644 index 0000000000000000000000000000000000000000..78916a253c90f04b88ad5370661e9805240218df --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/lambada_ja/lambada_ja.py @@ -0,0 +1,147 @@ +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# TODO: Address all TODOs and remove all explanatory comments +"""LAMBADA (OpenAI) dataset.""" + +import os +import json + +import datasets + + +_CITATION = """\ +@misc{ + author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, + title={The LAMBADA dataset}, + DOI={10.5281/zenodo.2630551}, + publisher={Zenodo}, + year={2016}, + month={Aug} +} +""" + +_DESCRIPTION = """\ +The LAMBADA dataset as processed by OpenAI. It is used to evaluate the capabilities +of computational models for text understanding by means of a word prediction task. +LAMBADA is a collection of narrative texts sharing the characteristic that human subjects +are able to guess their last word if they are exposed to the whole text, but not +if they only see the last sentence preceding the target word. To succeed on LAMBADA, +computational models cannot simply rely on local context, but must be able to keep track +of information in the broader discourse. 
+ +Reference: https://github.com/openai/gpt-2/issues/131#issuecomment-497136199 +""" + +_HOMEPAGE = "https://zenodo.org/record/2630551#.X4Xzn5NKjUI" + +# TODO: Add the licence for the dataset here if you can find it +_LICENSE = "Modified MIT" + +_BASE_URL = ( + "https://huggingface.co/datasets/EleutherAI/lambada_openai/resolve/main/data" +) + +JA_PATH = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "lambada_test_ja.jsonl" +) +_URLS = { + "default": f"{_BASE_URL}/lambada_test.jsonl", + "de": f"{_BASE_URL}/lambada_test_de.jsonl", + "en": f"{_BASE_URL}/lambada_test_en.jsonl", + "es": f"{_BASE_URL}/lambada_test_es.jsonl", + "fr": f"{_BASE_URL}/lambada_test_fr.jsonl", + "it": f"{_BASE_URL}/lambada_test_it.jsonl", + "ja": "https://gist.githubusercontent.com/mkshing/22b4623233940b2baa2f924e60f9b287/raw/c2c58325f5bc599818fe5f7d6f6b9af3e7699ed6/lambada_test_ja.jsonl", +} + + +class LambadaOpenAI(datasets.GeneratorBasedBuilder): + """LAMBADA is a dataset to evaluate the capabilities of computational models for text understanding by means of a word prediction task.""" + + VERSION = datasets.Version("1.0.0") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig( + name="default", + version=VERSION, + description="Pre-processed English LAMBADA dataset from OpenAI", + ), + datasets.BuilderConfig( + name="de", + version=VERSION, + description="The German translated LAMBADA OpenAI dataset", + ), + datasets.BuilderConfig( + name="en", + version=VERSION, + description="The English translated LAMBADA OpenAI dataset", + ), + datasets.BuilderConfig( + name="es", + version=VERSION, + description="The Spanish translated LAMBADA OpenAI dataset", + ), + datasets.BuilderConfig( + name="fr", + version=VERSION, + description="The French translated LAMBADA OpenAI dataset", + ), + datasets.BuilderConfig( + name="it", + version=VERSION, + description="The Italian translated LAMBADA OpenAI dataset", + ), + datasets.BuilderConfig( + name="ja", + version=VERSION, + description="The 
Japanese translated LAMBADA OpenAI dataset", + ), + ] + + DEFAULT_CONFIG_NAME = "default" + + def _info(self): + features = datasets.Features( + { + "text": datasets.Value("string"), + } + ) + return datasets.DatasetInfo( + description=f"{_DESCRIPTION}\n{self.config.description}", + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + urls = _URLS[self.config.name] + data_dir = dl_manager.download_and_extract(urls) + return [ + datasets.SplitGenerator( + name=datasets.Split.TEST, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": data_dir, + "split": "test", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + def _generate_examples(self, filepath, split): + with open(filepath, encoding="utf-8") as f: + for key, row in enumerate(f): + data = json.loads(row) + yield key, {"text": data["text"]} diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/__pycache__/__init__.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d91c73b7ddbacaac4869652da576b720414f1e47 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/__pycache__/logiqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/__pycache__/logiqa.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..5c3535cff8c005d29e17f42b34a0e3767d51c34f Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/__pycache__/logiqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/dataset_infos.json b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/dataset_infos.json new file mode 100644 index 0000000000000000000000000000000000000000..12a203cb05f9c71c0a135077ec9cae5957592ec7 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/dataset_infos.json @@ -0,0 +1 @@ +{"logiqa": {"description": "LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.\n", "citation": "@misc{liu2020logiqa,\n title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, \n author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},\n year={2020},\n eprint={2007.08124},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/lgw863/LogiQA-dataset", "license": "", "features": {"label": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "logiqa", "config_name": "logiqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 6419852, 
"num_examples": 7376, "dataset_name": "logiqa"}, "test": {"name": "test", "num_bytes": 571705, "num_examples": 651, "dataset_name": "logiqa"}, "validation": {"name": "validation", "num_bytes": 562437, "num_examples": 651, "dataset_name": "logiqa"}}, "download_checksums": {"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt": {"num_bytes": 6281272, "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt": {"num_bytes": 559060, "checksum": "359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt": {"num_bytes": 550021, "checksum": "4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"}}, "download_size": 7390353, "post_processing_size": null, "dataset_size": 7553994, "size_in_bytes": 14944347}} diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/logiqa.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/logiqa.py new file mode 100644 index 0000000000000000000000000000000000000000..b1f5521596e502578e95fa9240ead23aea8850ec --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/logiqa.py @@ -0,0 +1,124 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""LogiQA dataset.""" + + +import datasets + + +_CITATION = """\ +@misc{liu2020logiqa, + title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, + author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang}, + year={2020}, + eprint={2007.08124}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +""" + +_DESCRIPTION = """\ +LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA +instances, covering multiple types of deductive reasoning. Results show that state- +of-the-art neural models perform by far worse than human ceiling. The dataset can +also serve as a benchmark for reinvestigating logical AI under the deep learning +NLP setting. +""" + +_HOMEPAGE = "https://github.com/lgw863/LogiQA-dataset" + +# TODO: Add the licence for the dataset here if you can find it +_LICENSE = "" + +_URLS = { + "train": "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt", + "validation": "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt", + "test": "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt", +} + + +class Logiqa(datasets.GeneratorBasedBuilder): + """LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning""" + + VERSION = datasets.Version("0.0.1") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig( + name="logiqa", version=VERSION, description="The LogiQA dataset." 
+ ), + ] + + def _info(self): + features = datasets.Features( + { + "label": datasets.Value("string"), + "context": datasets.Value("string"), + "question": datasets.Value("string"), + "options": datasets.features.Sequence(datasets.Value("string")), + } + ) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + urls = { + "train": _URLS["train"], + "test": _URLS["test"], + "validation": _URLS["validation"], + } + data_dir = dl_manager.download_and_extract(urls) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": data_dir["train"], + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepath": data_dir["test"], "split": "test"}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": data_dir["validation"], + "split": "validation", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + def _generate_examples(self, filepath, split): + def normalize(text): + return text.replace(".", ". 
").strip() + + with open(filepath, encoding="utf-8") as f: + data = f.read().strip().split("\n\n") + for key, row in enumerate(data): + example = row.split("\n") + yield key, { + "label": example[0].strip(), + "context": normalize(example[1]), + "question": normalize(example[2]), + "options": [normalize(option[2:]) for option in example[3:]], + } diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/__pycache__/__init__.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9adc65a09f44220701732b0519c4744590fe0501 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/__pycache__/mutual.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/__pycache__/mutual.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc5fe6a7e50243539e372aab26c9083b6f0a9639 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/__pycache__/mutual.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/dataset_infos.json b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/dataset_infos.json new file mode 100644 index 0000000000000000000000000000000000000000..f8c438b3f85d96a670fad128cbc555a2ab22c7fd --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/dataset_infos.json @@ -0,0 +1 @@ +{"mutual": {"description": "MuTual is a 
retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 5141602, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 634396, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 624271, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6400269, "size_in_bytes": 17398147}, "mutual_plus": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nMuTualPlus is a more difficult MuTual that replaces 
positive responses with a safe responses.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual_plus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 4921179, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 606620, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 597340, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6125139, "size_in_bytes": 17123017}} diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/mutual.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/mutual.py new file mode 100644 index 0000000000000000000000000000000000000000..c519e663bea10d4097bcaad4299148ce152f25a9 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/mutual.py 
@@ -0,0 +1,136 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MuTual dataset.""" + + +import json +import os +from pathlib import Path + +import datasets + + +_CITATION = """\ +@inproceedings{mutual, + title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning", + author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" , + booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics", + year = "2020", + publisher = "Association for Computational Linguistics", +} +""" + +_DESCRIPTION = """\ +MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is +modified from Chinese high school English listening comprehension test data. +""" + +_HOMEPAGE = "https://github.com/Nealcly/MuTual" + +# TODO: Add the licence for the dataset here if you can find it +_LICENSE = "" + +_URLS = "https://github.com/Nealcly/MuTual/archive/master.zip" + + +class Mutual(datasets.GeneratorBasedBuilder): + """MuTual: A Dataset for Multi-Turn Dialogue Reasoning""" + + VERSION = datasets.Version("0.0.1") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig( + name="mutual", version=VERSION, description="The MuTual dataset." 
+ ), + datasets.BuilderConfig( + name="mutual_plus", + version=VERSION, + description="MuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.", + ), + ] + + def _info(self): + features = datasets.Features( + { + "answers": datasets.Value("string"), + "options": datasets.features.Sequence(datasets.Value("string")), + "article": datasets.Value("string"), + "id": datasets.Value("string"), + } + ) + return datasets.DatasetInfo( + description=f"{_DESCRIPTION}\n{self.config.description}", + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + urls = _URLS + data_dir = dl_manager.download_and_extract(urls) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "basepath": os.path.join( + data_dir, "MuTual-master", "data", self.config.name, "train" + ), + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "basepath": os.path.join( + data_dir, "MuTual-master", "data", self.config.name, "test" + ), + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "basepath": os.path.join( + data_dir, "MuTual-master", "data", self.config.name, "dev" + ), + "split": "dev", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + def _generate_examples(self, basepath, split): + # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. + # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. 
+ key = 0 + for file in sorted(Path(basepath).iterdir()): + if file.suffix != ".txt": + continue + with open(file, "r", encoding="utf-8") as f: + data_str = f.read() + # Ignore the occasional empty file. + if not data_str: + continue + data = json.loads(data_str) + yield key, { + "answers": data["answers"], + "options": data["options"], + "article": data["article"], + "id": data["id"], + } + key += 1 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/__pycache__/__init__.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e3dd3fe197ddbd5b96386263675dc9aa15c09ef8 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/__pycache__/pile.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/__pycache__/pile.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8448382ae6438d0d22c810207c84456ca691a66 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/__pycache__/pile.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/dataset_infos.json b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/dataset_infos.json new file mode 100644 index 0000000000000000000000000000000000000000..69a87cc33fd151e611297a8637e61e638cf6c955 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/dataset_infos.json @@ -0,0 +1 @@ +{"pile_arxiv": 
{"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nArXiv", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_arxiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 113218251, "num_examples": 2407, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 115653720, "num_examples": 2434, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 228871971, "size_in_bytes": 1160030307}, "pile_books3": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nBooks3", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_books3", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 150095743, "num_examples": 269, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 177359876, "num_examples": 301, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 327455619, "size_in_bytes": 1258613955}, "pile_bookcorpus2": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nBookCorpus2", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_bookcorpus2", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 9680652, "num_examples": 28, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9776271, "num_examples": 26, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 19456923, "size_in_bytes": 950615259}, "pile_dm-mathematics": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nDM Mathematics", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_dm-mathematics", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 15756556, "num_examples": 1922, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 16453386, "num_examples": 2007, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 32209942, "size_in_bytes": 963368278}, "pile_enron": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nEnron Emails", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_enron", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 1638859, "num_examples": 1010, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 1556487, "num_examples": 947, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 3195346, "size_in_bytes": 934353682}, "pile_europarl": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nEuroParl", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_europarl", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 8789652, "num_examples": 157, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9111791, "num_examples": 133, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 17901443, "size_in_bytes": 949059779}, "pile_freelaw": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nFreeLaw", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_freelaw", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 80808693, "num_examples": 5101, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 80363814, "num_examples": 5094, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 161172507, "size_in_bytes": 1092330843}, "pile_github": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nGithub", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_github", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 95654706, "num_examples": 18195, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 97179576, "num_examples": 18337, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 192834282, "size_in_bytes": 1123992618}, "pile_gutenberg": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nGutenberg (PG-19)", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_gutenberg", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 30243176, "num_examples": 80, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 24685980, "num_examples": 60, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 54929156, "size_in_bytes": 986087492}, "pile_hackernews": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nHackerNews", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_hackernews", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 8124255, "num_examples": 1632, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9803822, "num_examples": 1619, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 17928077, "size_in_bytes": 949086413}, "pile_nih-exporter": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nNIH ExPorter", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_nih-exporter", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 3928804, "num_examples": 1884, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 3927967, "num_examples": 1825, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 7856771, "size_in_bytes": 939015107}, "pile_opensubtitles": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nOpenSubtitles", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_opensubtitles", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 21008996, "num_examples": 642, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 19622904, "num_examples": 621, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 40631900, "size_in_bytes": 971790236}, "pile_openwebtext2": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nOpenWebText2", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_openwebtext2", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 128624303, "num_examples": 32925, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 131554302, "num_examples": 33400, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 260178605, "size_in_bytes": 1191336941}, "pile_philpapers": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPhilPapers", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_philpapers", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 5090158, "num_examples": 68, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 6499078, "num_examples": 64, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 11589236, "size_in_bytes": 942747572}, "pile_pile-cc": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPile-CC", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pile-cc", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 235004043, "num_examples": 52790, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 233535650, "num_examples": 52792, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 468539693, "size_in_bytes": 1399698029}, "pile_pubmed-abstracts": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPubMed Abstracts", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pubmed-abstracts", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 39908950, "num_examples": 29895, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 40008336, "num_examples": 29871, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 79917286, "size_in_bytes": 1011075622}, "pile_pubmed-central": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPubMed Central", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pubmed-central", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 187251519, "num_examples": 5911, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 184791818, "num_examples": 5977, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 372043337, "size_in_bytes": 1303201673}, "pile_stackexchange": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nStackExchange", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_stackexchange", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 66441557, "num_examples": 30378, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 66011397, "num_examples": 29950, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 132452954, "size_in_bytes": 1063611290}, "pile_upsto": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nUSPTO Backgrounds", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_upsto", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 47345405, "num_examples": 11415, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 48122320, "num_examples": 11387, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 95467725, "size_in_bytes": 1026626061}, "pile_ubuntu-irc": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nUbuntu IRC", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_ubuntu-irc", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 5694218, "num_examples": 22, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 7410104, "num_examples": 21, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 13104322, "size_in_bytes": 944262658}, "pile_wikipedia": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nWikipedia (en)", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_wikipedia", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 52166968, "num_examples": 17511, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 53186137, "num_examples": 17478, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 105353105, "size_in_bytes": 1036511441}, "pile_youtubesubtitles": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nYoutubeSubtitles", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_youtubesubtitles", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 7377448, "num_examples": 342, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 8937546, "num_examples": 326, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 16314994, "size_in_bytes": 947473330}} diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/pile.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/pile.py new file mode 100644 index 0000000000000000000000000000000000000000..a6384bab5defe8d797d7a117d89ad4cfb9c30900 --- /dev/null +++ 
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pile dataset."""


import json

import datasets


_CITATION = """\
@article{pile,
    title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},
    author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},
    journal={arXiv preprint arXiv:2101.00027},
    year={2020}
}
"""

_DESCRIPTION = """\
The Pile is a 825 GiB diverse, open source language modeling data set that consists
of 22 smaller, high-quality datasets combined together. To score well on Pile
BPB (bits per byte), a model must be able to understand many disparate domains
including books, github repositories, webpages, chat logs, and medical, physics,
math, computer science, and philosophy papers.
"""

_HOMEPAGE = "https://pile.eleuther.ai/"

# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""

_URLS = {
    "validation": "https://the-eye.eu/public/AI/pile/val.jsonl.zst",
    "test": "https://the-eye.eu/public/AI/pile/test.jsonl.zst",
}

# Maps each builder-config name to the exact `pile_set_name` value used in the
# raw jsonl metadata; _generate_examples filters on this value.
_NAMES = {
    "pile_arxiv": "ArXiv",
    "pile_books3": "Books3",
    "pile_bookcorpus2": "BookCorpus2",
    "pile_dm-mathematics": "DM Mathematics",
    "pile_enron": "Enron Emails",
    "pile_europarl": "EuroParl",
    "pile_freelaw": "FreeLaw",
    "pile_github": "Github",
    "pile_gutenberg": "Gutenberg (PG-19)",
    "pile_hackernews": "HackerNews",
    "pile_nih-exporter": "NIH ExPorter",
    "pile_opensubtitles": "OpenSubtitles",
    "pile_openwebtext2": "OpenWebText2",
    "pile_philpapers": "PhilPapers",
    "pile_pile-cc": "Pile-CC",
    "pile_pubmed-abstracts": "PubMed Abstracts",
    "pile_pubmed-central": "PubMed Central",
    "pile_stackexchange": "StackExchange",
    "pile_upsto": "USPTO Backgrounds",
    "pile_ubuntu-irc": "Ubuntu IRC",
    "pile_wikipedia": "Wikipedia (en)",
    "pile_youtubesubtitles": "YoutubeSubtitles",
}


class Pile(datasets.GeneratorBasedBuilder):
    """The Pile is a 825 GiB diverse, open source language modeling dataset."""

    VERSION = datasets.Version("0.0.1")

    # Built with a plain loop rather than a comprehension: a comprehension body
    # cannot see class-scope names such as VERSION (Python scoping rule).
    BUILDER_CONFIGS = []
    for _config_name in _NAMES:
        BUILDER_CONFIGS.append(
            datasets.BuilderConfig(
                name=_config_name,
                version=VERSION,
                description=_NAMES[_config_name],
            )
        )
    del _config_name

    def _info(self):
        """Return the DatasetInfo; the description is the shared blurb plus the subset name."""
        return datasets.DatasetInfo(
            description=f"{_DESCRIPTION}\n{self.config.description}",
            features=datasets.Features({"text": datasets.Value("string")}),
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download both jsonl.zst archives and declare test/validation splits."""
        downloaded = dl_manager.download_and_extract(
            {"validation": _URLS["validation"], "test": _URLS["test"]}
        )
        split_plan = (
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "validation"),
        )
        return [
            datasets.SplitGenerator(
                name=split_enum,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": downloaded[split_key], "split": split_key},
            )
            for split_enum, split_key in split_plan
        ]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, filepath, split):
        """Stream the jsonl file, yielding only rows belonging to this config's subset.

        Keys are the line index in the full (unfiltered) file, so they are
        unique but not contiguous for any single subset.
        """
        wanted_subset = _NAMES[self.config.name]
        with open(filepath, encoding="utf-8") as handle:
            for line_idx, raw_line in enumerate(handle):
                record = json.loads(raw_line)
                if record["meta"]["pile_set_name"] == wanted_subset:
                    yield line_idx, {"text": record["text"]}
0000000000000000000000000000000000000000..86fe853167145358addae0443729646dfe9585ec --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/quac/dataset_infos.json @@ -0,0 +1 @@ +{"quac": {"description": "Question Answering in Context (QuAC) is a dataset for modeling, understanding, and \nparticipating in information seeking dialog. Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n", "citation": "@article{choi2018quac,\n title={Quac: Question answering in context},\n author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n journal={arXiv preprint arXiv:1808.07036},\n year={2018}\n}\n", "homepage": "https://quac.ai/", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "section_title": {"dtype": "string", "id": null, "_type": "Value"}, "paragraph": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "quac", "config_name": "quac", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 212391958, "num_examples": 83568, "dataset_name": "quac"}, "validation": {"name": "validation", "num_bytes": 20678483, "num_examples": 7354, "dataset_name": "quac"}}, "download_checksums": {"https://s3.amazonaws.com/my89public/quac/train_v0.2.json": {"num_bytes": 68114819, "checksum": "ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"}, "https://s3.amazonaws.com/my89public/quac/val_v0.2.json": 
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""QuAC dataset."""


import json

import datasets


_CITATION = """\
@article{choi2018quac,
    title={Quac: Question answering in context},
    author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},
    journal={arXiv preprint arXiv:1808.07036},
    year={2018}
}
"""

_DESCRIPTION = """\
Question Answering in Context (QuAC) is a dataset for modeling, understanding, and
participating in information seeking dialog. Data instances consist of an interactive
dialog between two crowd workers: (1) a student who poses a sequence of freeform
questions to learn as much as possible about a hidden Wikipedia text, and (2)
a teacher who answers the questions by providing short excerpts (spans) from the text.
"""

_HOMEPAGE = "https://quac.ai/"

# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""

_URLS = {
    "train": "https://s3.amazonaws.com/my89public/quac/train_v0.2.json",
    "validation": "https://s3.amazonaws.com/my89public/quac/val_v0.2.json",
}


class Quac(datasets.GeneratorBasedBuilder):
    """Question Answering in Context (QuAC) is a dataset for modeling, understanding, and participating in information seeking dialog."""

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="quac", version=VERSION, description="The QuAC dataset"
        ),
    ]

    def _info(self):
        """Return the DatasetInfo describing one flattened (question, answer) record."""
        record_features = datasets.Features(
            {
                "title": datasets.Value("string"),
                "section_title": datasets.Value("string"),
                "paragraph": datasets.Value("string"),
                "question": datasets.Value("string"),
                "answer": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=record_features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the raw JSON files and declare train/validation splits."""
        downloaded = dl_manager.download_and_extract(
            {"train": _URLS["train"], "validation": _URLS["validation"]}
        )
        split_plan = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.VALIDATION, "validation"),
        )
        return [
            datasets.SplitGenerator(
                name=split_enum,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepath": downloaded[split_key], "split": split_key},
            )
            for split_enum, split_key in split_plan
        ]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, filepath, split):
        """Flatten each article's first paragraph into one example per QA pair.

        Only the first paragraph of each article is used, the "CANNOTANSWER"
        sentinel is stripped from its context, and only the first reference
        answer of each question is kept — mirroring the on-disk schema.
        """
        with open(filepath, encoding="utf-8") as handle:
            articles = json.load(handle)["data"]
        example_idx = 0
        for article in articles:
            first_paragraph = article["paragraphs"][0]
            context = first_paragraph["context"].replace("CANNOTANSWER", "")
            for qa in first_paragraph["qas"]:
                # Yields examples as (key, example) tuples
                yield example_idx, {
                    "title": article["title"],
                    "section_title": article["section_title"],
                    "paragraph": context,
                    "question": qa["question"],
                    "answer": qa["answers"][0]["text"],
                }
                example_idx += 1
b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/sat_analogies/sat_analogies.py new file mode 100644 index 0000000000000000000000000000000000000000..76ae9af713723c08afecc0b6b3b7a4f9befd0ba4 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/sat_analogies/sat_analogies.py @@ -0,0 +1,128 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""SAT Analogy Questions dataset.""" + + +import os + +import datasets + + +_CITATION = """\ +@article{article, + author = {Turney, Peter}, + year = {2006}, + month = {09}, + pages = {379-416}, + title = {Similarity of Semantic Relations}, + volume = {32}, + journal = {Computational Linguistics}, + doi = {10.1162/coli.2006.32.3.379} +} +""" + +_DESCRIPTION = """\ +SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 +multiple-choice analogy questions; 5 choices per question. 
+""" + +_HOMEPAGE = "https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art)" + +# TODO: Add the licence for the dataset here if you can find it +_LICENSE = "" + + +class SatAnalogies(datasets.GeneratorBasedBuilder): + """SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 multiple-choice analogy questions.""" + + VERSION = datasets.Version("0.0.1") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig( + name="sat_analogies", + version=VERSION, + description="The SAT Analogy Questions dataset", + ), + ] + + @property + def manual_download_instructions(self): + return ( + "To use SAT Analogy Questions you have to download it manually. Please " + "email Peter Turney to request the data (https://www.apperceptual.com). " + "Once you receive a download link for the dataset, supply the local path " + "as the `data_dir` arg: " + "`datasets.load_dataset('sat_analogies', data_dir='path/to/folder/folder_name')`" + ) + + def _info(self): + features = datasets.Features( + { + "source": datasets.Value("string"), + "stem": datasets.Value("string"), + "choices": datasets.features.Sequence(datasets.Value("string")), + "solution": datasets.Value("string"), + } + ) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir)) + if not os.path.exists(data_dir): + raise FileNotFoundError( + f"{data_dir} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('matinf', data_dir=...)` that includes SAT-package-V3.txt. 
Manual download instructions: {self.manual_download_instructions}" + ) + return [ + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join(data_dir, "SAT-package-V3.txt"), + }, + ) + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + def _generate_examples(self, filepath): + data = [] + with open(filepath, "r", encoding="utf-8") as f: + record = [] + for line in f: + line = line.strip() + if len(line) == 0 and record: + data.append(record) + record = [] + elif len(line) > 0 and line[0] == "#": + # Skip comments. + continue + else: + record.append(line) + data.append(record) + for key, record in enumerate(data): + source = record[-8] + stem = record[-7] + choices = record[-6:-1] + solution = record[-1] + yield key, { + "source": source, + "stem": stem, + "choices": choices, + "solution": solution, + } diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/triviaqa/README.md b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/triviaqa/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b7566ac3fde0940a50d018ed0d5f14e4cbe44432 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/triviaqa/README.md @@ -0,0 +1,40 @@ +--- +dataset_info: + features: + - name: question_id + dtype: string + - name: question_source + dtype: string + - name: question + dtype: string + - name: answer + struct: + - name: aliases + sequence: string + - name: value + dtype: string + - name: search_results + sequence: + - name: description + dtype: string + - name: filename + dtype: string + - name: rank + dtype: int32 + - name: title + dtype: string + - name: url + dtype: string + - name: search_context + dtype: string + config_name: triviaqa + splits: + - name: train + num_bytes: 1270894387 + num_examples: 87622 + - name: validation + num_bytes: 163755044 + num_examples: 11313 + 
download_size: 632549060 + dataset_size: 1434649431 +--- diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/triviaqa/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/triviaqa/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/triviaqa/__pycache__/__init__.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/triviaqa/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b36079cb36b8ea551de8d7431d2242cdecb316b0 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/triviaqa/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/triviaqa/__pycache__/triviaqa.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/triviaqa/__pycache__/triviaqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b82a47fc4b7e95e9989c31c1468269a3103d905e Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/triviaqa/__pycache__/triviaqa.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/triviaqa/dataset_infos.json b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/triviaqa/dataset_infos.json new file mode 100644 index 0000000000000000000000000000000000000000..87f4e064cfea01354dbc69b1c784553fce783172 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/triviaqa/dataset_infos.json @@ -0,0 +1 @@ +{"triviaqa": {"description": "TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. 
TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n", "citation": "@InProceedings{JoshiTriviaQA2017,\n author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},\n title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n month = {July},\n year = {2017},\n address = {Vancouver, Canada},\n publisher = {Association for Computational Linguistics},\n}\n", "homepage": "https://nlp.cs.washington.edu/triviaqa/", "license": "Apache License 2.0", "features": {"question_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_source": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"aliases": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "value": {"dtype": "string", "id": null, "_type": "Value"}}, "search_results": {"feature": {"description": {"dtype": "string", "id": null, "_type": "Value"}, "filename": {"dtype": "string", "id": null, "_type": "Value"}, "rank": {"dtype": "int32", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}, "search_context": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "triviaqa", "config_name": "triviaqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1271393601, "num_examples": 87622, "dataset_name": "triviaqa"}, 
"validation": {"name": "validation", "num_bytes": 163819509, "num_examples": 11313, "dataset_name": "triviaqa"}}, "download_checksums": {"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz": {"num_bytes": 546481381, "checksum": "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}}, "download_size": 546481381, "post_processing_size": null, "dataset_size": 1435213110, "size_in_bytes": 1981694491}} diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/triviaqa/triviaqa.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/triviaqa/triviaqa.py new file mode 100644 index 0000000000000000000000000000000000000000..40aba8c1dc4374c05f60d052d089dbc82c8e1d97 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/triviaqa/triviaqa.py @@ -0,0 +1,157 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Custom TriviaQA because HF version sanitizes the dataset differently. +# https://github.com/huggingface/datasets/blob/9977ade72191ff0b6907ec63935448c6269a91a1/datasets/trivia_qa/trivia_qa.py#L285 +"""TriviaQA (Unfiltered Raw) dataset.""" + + +import json +import os + +import datasets + + +_CITATION = """\ +@InProceedings{JoshiTriviaQA2017, + author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. 
and Zettlemoyer, Luke}, + title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}, + booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics}, + month = {July}, + year = {2017}, + address = {Vancouver, Canada}, + publisher = {Association for Computational Linguistics}, +} +""" + +_DESCRIPTION = """\ +TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence +triples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts +and independently gathered evidence documents, six per question on average, that provide +high quality distant supervision for answering the questions. +""" + +_HOMEPAGE = "https://nlp.cs.washington.edu/triviaqa/" + +_LICENSE = "Apache License 2.0" + +_URLS = "https://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz" + + +class Triviaqa(datasets.GeneratorBasedBuilder): + """TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence triples""" + + VERSION = datasets.Version("0.0.2") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig( + name="triviaqa", version=VERSION, description="The TriviaQA dataset" + ), + ] + + def _info(self): + features = datasets.Features( + { + "question_id": datasets.Value("string"), + "question_source": datasets.Value("string"), + "question": datasets.Value("string"), + "answer": { + "aliases": datasets.features.Sequence( + datasets.Value("string"), + ), + "value": datasets.Value("string"), + }, + "search_results": datasets.features.Sequence( + { + "description": datasets.Value("string"), + "filename": datasets.Value("string"), + "rank": datasets.Value("int32"), + "title": datasets.Value("string"), + "url": datasets.Value("string"), + "search_context": datasets.Value("string"), + } + ), + } + ) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + 
+ def _split_generators(self, dl_manager): + urls = _URLS + data_dir = dl_manager.download_and_extract(urls) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join( + data_dir, "triviaqa-unfiltered", "unfiltered-web-train.json" + ), + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join( + data_dir, "triviaqa-unfiltered", "unfiltered-web-dev.json" + ), + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + def _generate_examples(self, filepath): + with open(filepath, encoding="utf-8") as f: + json_data = json.load(f)["Data"] + for key, data in enumerate(json_data): + search_results = [] + for search_result in data["SearchResults"]: + search_results.append( + { + "description": search_result["Description"] + if "Description" in search_result + else "", + "filename": search_result["Filename"] + if "Filename" in search_result + else "", + "rank": search_result["Rank"] + if "Rank" in search_result + else -1, + "title": search_result["Title"] + if "Title" in search_result + else "", + "url": search_result["Url"] + if "Url" in search_result + else "", + "search_context": search_result["SearchContext"] + if "SearchContext" in search_result + else "", + } + ) + yield key, { + "question_id": data["QuestionId"], + "question_source": data["QuestionSource"], + "question": data["Question"], + "answer": { + "aliases": data["Answer"]["Aliases"], + "value": data["Answer"]["Value"], + }, + "search_results": search_results, + } diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/unscramble/__init__.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/unscramble/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/unscramble/__pycache__/__init__.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/unscramble/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22e29085edd3056f52c16bb28fc4ef15c0922e01 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/unscramble/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/unscramble/__pycache__/unscramble.cpython-310.pyc b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/unscramble/__pycache__/unscramble.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a09928c8535772ec29c80f3a68b760dbf5dce29 Binary files /dev/null and b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/unscramble/__pycache__/unscramble.cpython-310.pyc differ diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/unscramble/dataset_infos.json b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/unscramble/dataset_infos.json new file mode 100644 index 0000000000000000000000000000000000000000..bae29209daf50ccb5bd6adc0779a2138cddb908b --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/unscramble/dataset_infos.json @@ -0,0 +1 @@ +{"mid_word_1_anagrams": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. 
Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "mid_word_1_anagrams", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 271516, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/mid_word_1_anagrams.jsonl.gz": {"num_bytes": 106533, "checksum": 
"6768a86896083199de4815d4964cb2f6f1046476cfd80c2a562784f182905979"}}, "download_size": 106533, "post_processing_size": null, "dataset_size": 271516, "size_in_bytes": 378049}, "mid_word_2_anagrams": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "mid_word_2_anagrams", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/mid_word_2_anagrams.jsonl.gz": {"num_bytes": 109091, "checksum": "c3d839d09a7954b78a27cd2cd75d4ed0488656c56ef4dbd741a005343826cb01"}}, "download_size": 109091, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 391745}, "cycle_letters_in_word": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. 
Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "cycle_letters_in_word", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/cycle_letters_in_word.jsonl.gz": {"num_bytes": 98451, "checksum": 
"1689c9002bb8c5988bf5f05e977c9db92f57932c1b5a38998c29ac0dd71e1d42"}}, "download_size": 98451, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 381105}, "random_insertion_in_word": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "random_insertion_in_word", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 353981, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/random_insertion_in_word.jsonl.gz": {"num_bytes": 143626, "checksum": "72e65d83da53d15752ee0c47379509de149ddbad32d61184e5991df29616b78a"}}, "download_size": 143626, "post_processing_size": null, "dataset_size": 353981, "size_in_bytes": 497607}, "reversed_words": {"description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. 
Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.\n", "citation": "@inproceedings{NEURIPS2020_1457c0d6,\n author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},\n booktitle = {Advances in Neural Information Processing Systems},\n editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},\n pages = {1877--1901},\n publisher = {Curran Associates, Inc.},\n title = {Language Models are Few-Shot Learners},\n url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf},\n volume = {33},\n year = {2020}\n}\n", "homepage": "https://github.com/openai/gpt-3/tree/master/data", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "completion": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "unscramble", "config_name": "reversed_words", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 282654, "num_examples": 10000, "dataset_name": "unscramble"}}, "download_checksums": {"https://raw.githubusercontent.com/openai/gpt-3/master/data/reversed_words.jsonl.gz": {"num_bytes": 91917, "checksum": 
"133a08f875cd6c1ef8608a3233571a773881cc27b1c707de738cc6543439332a"}}, "download_size": 91917, "post_processing_size": null, "dataset_size": 282654, "size_in_bytes": 374571}} diff --git a/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/unscramble/unscramble.py b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/unscramble/unscramble.py new file mode 100644 index 0000000000000000000000000000000000000000..86f1e1ef5cbabf4ae57368e0d4fd00b126d4e37e --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/unscramble/unscramble.py @@ -0,0 +1,110 @@ +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Unscramble dataset.""" + + +import json +import os + +import datasets + + +_CITATION = """\ +@inproceedings{NEURIPS2020_1457c0d6, + author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin}, + pages = {1877--1901}, + publisher = {Curran Associates, Inc.}, + title = {Language Models are Few-Shot Learners}, + url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf}, + volume = {33}, + year = {2020} +} +""" + +_DESCRIPTION = """\ +Unscramble is a small battery of 5 “character manipulation” tasks. Each task +involves giving the model a word distorted by some combination of scrambling, +addition, or deletion of characters, and asking it to recover the original word. 
+""" + +_HOMEPAGE = "https://github.com/openai/gpt-3/tree/master/data" + +# TODO: Add the licence for the dataset here if you can find it +_LICENSE = "" + +_BASE_URL = "https://raw.githubusercontent.com/openai/gpt-3/master/data" + + +_DESCRIPTIONS = { + "mid_word_1_anagrams": "Anagrams of all but the first and last letter.", + "mid_word_2_anagrams": "Anagrams of all but the first and last 2 letters.", + "cycle_letters_in_word": "Cycle letters in the word.", + "random_insertion_in_word": "Random insertions in the word that must be removed.", + "reversed_words": "Words spelled backwards that must be reversed.", +} +_NAMES = _DESCRIPTIONS.keys() + + +class Unscramble(datasets.GeneratorBasedBuilder): + """Unscramble is a small battery of 5 “character manipulation” tasks.""" + + VERSION = datasets.Version("0.0.1") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig( + name=name, version=version, description=_DESCRIPTIONS[name] + ) + for name, version in zip(_NAMES, [VERSION] * len(_NAMES)) + ] + + def _info(self): + features = datasets.Features( + { + "context": datasets.Value("string"), + "completion": datasets.Value("string"), + } + ) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + urls = os.path.join(_BASE_URL, f"{self.config.name}.jsonl.gz") + data_dir = dl_manager.download_and_extract(urls) + return [ + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": data_dir, + "split": "validation", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + def _generate_examples(self, filepath, split): + with open(filepath, encoding="utf-8") as f: + for key, row in enumerate(f): + data = json.loads(row) + yield key, { + "context": data["context"], + "completion": data["completion"], + } diff --git 
import os
import json
import argparse
import pandas as pd
import blobfile as bf


def _list_json_files_recursively(data_dir):
    """Return all ``*.json`` file paths under ``data_dir`` (depth-first, sorted)."""
    results = []
    for entry in sorted(bf.listdir(data_dir)):
        full_path = bf.join(data_dir, entry)
        ext = entry.split(".")[-1]
        if "." in entry and ext.lower() in ["json"]:
            results.append(full_path)
        elif bf.isdir(full_path):
            results.extend(_list_json_files_recursively(full_path))
    return results


# Task subsets selectable via --mode; "all" keeps every task found in a
# result file.
MODE2TASKS = {"jglue": ["jcommonsenseqa", "jnli", "marc_ja", "jsquad"]}

if __name__ == "__main__":
    # Print, for each harness result json, the per-task scores and their
    # unweighted average as a JSON object.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--output_path", required=True, type=str, help="could be .json or dir"
    )
    parser.add_argument("--mode", default="all", choices=["jglue", "all"])
    args = parser.parse_args()
    if os.path.isfile(args.output_path):
        files = [args.output_path]
    else:
        files = _list_json_files_recursively(args.output_path)

    for file in files:
        with open(file) as f:
            info = json.load(f)
        results = info["results"]
        data = {"model_name": os.path.basename(file).replace(".json", "")}
        scores = []
        for k, v in results.items():
            if args.mode != "all":
                # task names look like "jsquad-1.1"; keep the base name only
                task_name = k.split("-")[0]
                if task_name not in MODE2TASKS[args.mode]:
                    continue
            if "acc" in v:
                # accuracy is reported as a fraction; convert to percent
                score = v["acc"] * 100
            elif "exact_match" in v:
                score = v["exact_match"]
            elif "rouge2" in v:
                score = v["rouge2"]
            else:
                # BUG FIX: the exception was previously constructed but never
                # raised, so an unknown metric silently reused the previous
                # task's score.
                raise NotImplementedError(v.keys())
            data[k] = round(score, 2)
            scores.append(score)
        # Simplified from sum/(len*100)*100, which is just the mean; also
        # guard against a result file with no matching tasks.
        data["average"] = round(sum(scores) / len(scores), 2) if scores else 0.0
        dumped = json.dumps(data, indent=2)
        print(dumped)
"""
A script to generate a harness shell script for running eval in a cluster.

Given a model directory, this script looks for a `harness.conf` file. The file
describes the path to the pretrained model (or the HuggingFace model name),
the model arguments, the tasks to run, and other details.

Example usage:

    python scripts/generate_harness.py models/stablelm/stablelm-jp-3b-ja50_rp50-700b/

Given these details the script outputs a command line which will run eval.

In detail, a config has two kinds of sections: the `[model]` section contains
general model options, while task-specific sections have names like
`[tasks.MY_TASK-0.1]`.

The model section contains the following values:

- model: almost always hf-causal
- path: the first arg to AutoModel.load_model
- tokenizer: path to the tokenizer
- args: any other model arguments

You can use interpolation in the config file, so it's common to include a
`project_dir` value and use it like this:

    tokenizer = ${project_dir}/tokenizers/hogehoge

Task sections contain the `fewshot` parameter and an optional `prompt`.

Configs support inheritance. The file at `models/harness.conf` specifies
global defaults, while files for each model group can define options not
specific for a model. These files are usually the most detailed, since task
and prompt selection are usually consistent for a model group.
"""

import os
import sys
import configparser
from pathlib import Path
import argparse


def generate_harness(path):
    """Build the `python main.py ...` eval command for the model dir `path`.

    Reads layered `harness.conf` files (grandparent -> parent -> model dir,
    later files win) and returns the command as a single string.
    """
    # grandparent is global config, parent is org config.
    # it's ok if they don't exist.
    hc = "harness.conf"
    config_paths = [path.parent.parent / hc, path.parent / hc, path / hc]
    # used to make PROJECT_DIR work just like shell
    interp = configparser.ExtendedInterpolation()
    conf = configparser.ConfigParser(interpolation=interp)
    conf.read(config_paths)

    # Build the model args. We don't have to interpolate the path here, it can
    # be handled in the file directly thanks to interpolation. Also, fall back
    # to the last two parts of the directory path as the HF name.
    fallback_path = os.path.join(*path.parts[-2:])

    model_path = conf["model"].get("path", fallback_path)
    # tokenizer is technically not required, but almost always present
    tokenizer = conf["model"].get("tokenizer")
    # args are technically not required
    args = conf["model"].get("args")
    model_args = f"pretrained={model_path}"

    # By default configparser doesn't handle empty strings properly, so
    # treat explicitly quoted empties as "not set".
    if tokenizer in ("''", '""'):
        tokenizer = None
    if tokenizer:
        model_args += "," + f"tokenizer={tokenizer}"

    if args in ("''", '""'):
        args = None
    if args:
        model_args += "," + args

    # Make the task list. Basically just attaching prompt versions, but some
    # tasks have no prompt versions because they don't use prompts.
    tasks = []
    fewshot = []
    for key, val in conf.items():
        if not key.startswith("tasks."):
            continue

        name = key.split(".", 1)[1]
        # BUG FIX: `val["prompt"]` raised KeyError for prompt-less task
        # sections, although the module doc documents `prompt` as optional.
        prompt = val.get("prompt", "")
        if prompt in ("''", '""'):
            prompt = ""
        if prompt:
            name = f"{name}-{prompt}"
        tasks.append(name)
        fewshot.append(val["fewshot"])

    tasks = ",".join(tasks)
    fewshot = ",".join(fewshot)

    output_path = path / "result.json"
    model_type = conf["model"]["model"]
    script = (
        "python main.py "
        f"--device cuda "
        f"--model {model_type} "
        f"--model_args {model_args} "
        f"--tasks {tasks} "
        f"--num_fewshot {fewshot} "
        f"--output_path {output_path} "
    )
    return script.strip()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="generate_harness.py",
        description="Generate an eval command based on configs.",
    )

    parser.add_argument("model_path")
    parser.add_argument(
        "-w",
        "--write",
        action="store_true",
        help="write harness script to default location",
    )

    args = parser.parse_args()
    path = Path(args.model_path).absolute()
    if path.is_file():
        # if we specified a file for some reason, just take the dir
        path = path.parent

    cmd = generate_harness(path)
    print(cmd)
    if args.write:
        opath = path / "harness.sh"
        with open(opath, "w") as ofile:
            ofile.write(cmd)
        print(f"wrote script to: {opath}")
print("**Target**:", "\n```\n" + task.doc_to_target(doc) + "\n```\n") + print() diff --git a/scripts/yans/eval/lm-evaluation-harness/scripts/make_gpt2_test_cases.py b/scripts/yans/eval/lm-evaluation-harness/scripts/make_gpt2_test_cases.py new file mode 100644 index 0000000000000000000000000000000000000000..361bc2ecd673d39afabe9c941cb242ada3e14752 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/scripts/make_gpt2_test_cases.py @@ -0,0 +1,47 @@ +import transformers + +import torch +import torch.nn.functional as F +import random + +random.seed(42) + + +data = [ + "A multilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)", + "The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons (with threshold activation); see § Terminology", + 'Multilayer perceptrons are sometimes colloquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]', + "An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear activation function.", + "MLP utilizes a supervised learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]", + "Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. 
Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. ", + "Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. 
We discuss broader societal impacts of this finding and of GPT-3 in general.", + "A multilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)", + "Hello World", +] + + +model = transformers.GPT2LMHeadModel.from_pretrained("gpt2") +tok = transformers.GPT2Tokenizer.from_pretrained("gpt2") + +tgs = [] + +for dat in data: + random.seed(dat) + # print(model(tok.encode(dat, return_tensors="pt"))[0][0]) + + toks = tok.encode(dat, return_tensors="pt") + ind = random.randrange(len(toks[0]) - 1) + logits = F.log_softmax(model(toks)[0], dim=-1)[:, :-1] # [batch, seq, vocab] + + res = torch.gather(logits, 2, toks[:, 1:].unsqueeze(-1)).squeeze(-1)[0] + + tgs.append(float(res[ind:].sum())) + print( + r'("""' + + tok.decode(toks[0, : ind + 1]) + + r'""", """' + + tok.decode(toks[0, ind + 1 :]) + + r'"""), ' + ) + +print(tgs) diff --git a/scripts/yans/eval/lm-evaluation-harness/scripts/make_leaderboard.py b/scripts/yans/eval/lm-evaluation-harness/scripts/make_leaderboard.py new file mode 100644 index 0000000000000000000000000000000000000000..dfa44bcaa1103a0acd9e4cc220397f10d220e758 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/scripts/make_leaderboard.py @@ -0,0 +1,146 @@ +import os +from pathlib import Path +import json +import pandas as pd +import blobfile as bf + + +OTHERS = { + "pretrained=abeja/gpt-neox-japanese-2.7b": "abeja-gpt-neox-japanese-2.7b", +} +RINNA = { + "pretrained=rinna/japanese-gpt-1b,use_fast=False": "rinna-japanese-gpt-1b", + "pretrained=rinna/japanese-gpt-neox-3.6b,use_fast=False": "rinna-japanese-gpt-neox-3.6b", + "pretrained=rinna/japanese-gpt-neox-3.6b-instruction-ppo,use_fast=False": "rinna-japanese-gpt-neox-3.6b-instruction-ppo", + "pretrained=rinna/japanese-gpt-neox-3.6b-instruction-sft,use_fast=False": "rinna-japanese-gpt-neox-3.6b-instruction-sft", + "pretrained=rinna/japanese-gpt-neox-3.6b-instruction-sft-v2,use_fast=False": "rinna-japanese-gpt-neox-3.6b-instruction-sft-v2", +} +CYBERAGENT = { + 
"pretrained=cyberagent/open-calm-medium": "cyberagent-open-calm-medium", + "pretrained=cyberagent/open-calm-large": "cyberagent-open-calm-large", + "pretrained=cyberagent/open-calm-1b": "cyberagent-open-calm-1b", + "pretrained=cyberagent/open-calm-3b": "cyberagent-open-calm-3b", + "pretrained=cyberagent/open-calm-7b": "cyberagent-open-calm-7b", +} +MODELARGS2ID = {**OTHERS, **RINNA, **CYBERAGENT} + +TASK2MAINMETRIC = { + "jcommonsenseqa": "acc", + "jnli": "acc", + "marc_ja": "acc", + "jsquad": "exact_match", + "jaquad": "exact_match", + "xlsum_ja": "rouge2", +} +TASK2SHOT = { + "jcommonsenseqa": 2, + "jnli": 3, + "marc_ja": 3, + "jsquad": 3, + "jaquad": 3, + "xlsum_ja": 1, +} + + +def get_class(model_args): + if model_args in RINNA: + return "rinna" + elif model_args in CYBERAGENT: + return "cyberagent" + elif model_args in OTHERS: + return "" + else: + raise NotImplementedError + + +def get_score(metric: str, value): + if metric == "acc": + return value * 100 + return value + + +def _list_json_files_recursively(data_dir): + results = [] + for entry in sorted(bf.listdir(data_dir)): + full_path = bf.join(data_dir, entry) + ext = entry.split(".")[-1] + if "." 
in entry and ext.lower() in ["json"]: + results.append(full_path) + elif bf.isdir(full_path): + results.extend(_list_json_files_recursively(full_path)) + return results + + +def model_hyperlink(link, model_name): + return f'{model_name}' + + +models_dir = "models" +url_repo = "https://github.com/Stability-AI/lm-evaluation-harness/tree/jp-stable/{}" +url_hf = "https://huggingface.co/{org}/{model_name}" + +files = _list_json_files_recursively(models_dir) + +res_dict = {} + + +def add_data(key, value): + global res_dict + if key not in res_dict: + res_dict[key] = [value] + else: + res_dict[key].append(value) + + +for file in files: + if "experiments" in file or "community" in file: + continue + with open(file) as f: + info = json.load(f) + results = info["results"] + # only 8 tasks + if len(results) != 8: + continue + model_id = os.path.basename(os.path.dirname(file)) + org = model_id.split("-")[0] + model_name = model_id[len(org) + 1 :] + add_data( + "model", + model_hyperlink(url_hf.format(org=org, model_name=model_name), model_id), + ) + p = os.path.join(os.path.dirname(file), "harness.sh") + add_data("eval script", model_hyperlink(url_repo.format(p), p)) + scores = [] + for k, v in results.items(): + if "acc" in v: + # to percent + score = v["acc"] * 100 + elif "exact_match" in v: + score = v["exact_match"] + elif "rouge2" in v: + score = v["rouge2"] + else: + NotImplementedError(v.keys()) + k = k.split("-")[0] + add_data(k, round(score, 2)) + scores.append(score) + add_data("average", round(sum(scores) / (len(scores) * 100) * 100, 2)) +df = pd.DataFrame.from_dict(res_dict) +df = df[ + [ + "model", + "average", + "jcommonsenseqa", + "jnli", + "marc_ja", + "jsquad", + "jaqket_v2", + "xlsum_ja", + "xwinograd_ja", + "mgsm", + "eval script", + ] +] +df.sort_values(by=["average"], inplace=True, ascending=False) +df.to_csv("jp_llm_leaderboard.csv", index=False) +df.to_markdown("jp_llm_leaderboard.md", index=False) diff --git 
a/scripts/yans/eval/lm-evaluation-harness/scripts/models.txt b/scripts/yans/eval/lm-evaluation-harness/scripts/models.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ba6f0533dbf2f25b7034e9f30004a5101139146 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/scripts/models.txt @@ -0,0 +1,11 @@ +rinna/rinna-japanese-gpt-neox-3.6b-instruction-ppo pretrained=rinna/japanese-gpt-neox-3.6b-instruction-ppo,use_fast=False +rinna/rinna-japanese-gpt-neox-3.6b-instruction-sft pretrained=rinna/japanese-gpt-neox-3.6b-instruction-sft,use_fast=False +rinna/rinna-japanese-gpt-1b pretrained=rinna/japanese-gpt-1b,use_fast=False +rinna/rinna-japanese-gpt-neox-3.6b-instruction-sft-v2 pretrained=rinna/japanese-gpt-neox-3.6b-instruction-sft-v2,use_fast=False +rinna/rinna-japanese-gpt-neox-3.6b pretrained=rinna/japanese-gpt-neox-3.6b,use_fast=False +abeja-gpt-neox-japanese-2.7b pretrained=abeja/gpt-neox-japanese-2.7b +cyberagent/cyberagent-open-calm-medium pretrained=cyberagent/open-calm-medium,use_fast=True +cyberagent/cyberagent-open-calm-3b pretrained=cyberagent/open-calm-3b +cyberagent/cyberagent-open-calm-7b pretrained=cyberagent/open-calm-7b +cyberagent/cyberagent-open-calm-large pretrained=cyberagent/open-calm-large,use_fast=True +cyberagent/cyberagent-open-calm-1b pretrained=cyberagent/open-calm-1b diff --git a/scripts/yans/eval/lm-evaluation-harness/scripts/notify.py b/scripts/yans/eval/lm-evaluation-harness/scripts/notify.py new file mode 100644 index 0000000000000000000000000000000000000000..779574bc502dbf78a7f60998af5519d2ec5e9055 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/scripts/notify.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python +# This is an example of sending a slack notification. For more details see +# official docs: +# https://api.slack.com/messaging/webhooks + +import requests +import json +import os + +# This URL is tied to a single channel. 
That can be generalized, or you can +# create a new "app" to use another channel. +WEBHOOK = os.environ.get("WEBHOOK_URL") +if WEBHOOK is None: + print("Webhook URL not found in WEBHOOK_URL env var. Will just print messages.") + + +def notify(message): + headers = {"Content-Type": "application/json"} + data = json.dumps({"text": message}) + if WEBHOOK is None: + print(message) + else: + requests.post(WEBHOOK, data=data, headers=headers) + + +if __name__ == "__main__": + print("Please type your message.") + message = input("message> ") + notify(message) diff --git a/scripts/yans/eval/lm-evaluation-harness/scripts/run_eval.py b/scripts/yans/eval/lm-evaluation-harness/scripts/run_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..26cef8d4814b66a7ba2d51019fbcd3073f0d0c42 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/scripts/run_eval.py @@ -0,0 +1,168 @@ +import submitit +from submitit.helpers import CommandFunction +import argparse +from notify import notify +from pathlib import Path + +from main_eval import main as main_eval + +# These are the standard 8 tasks +JAEVAL8_TASKS = [ + "jcommonsenseqa-1.1", + "jnli-1.1", + "marc_ja-1.1", + "jsquad-1.1", + "jaqket_v2-0.2", + "xlsum_ja-1.0", + "xwinograd_ja", + "mgsm-1.0", +] +JAEVAL8_FEWSHOT = [3, 3, 3, 2, 1, 1, 0, 5] + + +def eval_task(): + args = { + "tasks": ["jsquad-1.1-0.2"], + "num_fewshot": [1], + "model": "hf-causal", + "model_args": "pretrained=rinna/japanese-gpt-1b,use_fast=False", + "device": "cuda", + "limit": 100, + "verbose": True, + } + + main_eval(args, output_path="./check.json") + + +def build_executor( + name: str, + gpus_per_task: int, + cpus_per_gpu: int, + timeout: int = 0, + partition: str = "g40", + account: str = "stablegpt", +): + base_args = { + # just "gpus" does not work + "slurm_gpus_per_task": 8, + "name": "eval", + "slurm_account": "stablegpt", + "slurm_partition": "g40", + "slurm_cpus_per_gpu": 12, + # Default timeout is 5 minutes??? 
+ "timeout_min": 0, + } + + executor = submitit.AutoExecutor(folder="./logs") + executor.update_parameters(**base_args) + return executor + + +def build_shell_script_task(script_path, args, repo): + """This is how you can wrap an existing harness.sh. + + This is not currently used. + """ + task = CommandFunction(["bash", args.harness_script], cwd=repo) + return task + + +def run_job(executor, task, *args, **kwargs): + """Given an executor and a task, run the task with error reporting. + + `executor` should be a submitit executor. + + `task` can be a CommandFunction from submitit, which wraps a shell script, + or a Python function. Further positional or keyword arguments are passed to + the function. + """ + job = executor.submit(task, *args, **kwargs) + print("Submitted job") + print("See log at:") + print(f"\t{job.paths.stdout}") + + try: + output = job.result() + print("Job finished successfully!") + notify(f":white_check_mark: Eval Finished for `{job}`") + return output + except Exception as ee: # noqa: F841 + # submitit doesn't seem to have a parent class for their exceptions, so + # just catch everything. We want to be aware of any failure anyway. + # If this is noisy we can ignore certain kinds of early failures. + + msg = f""" + :rotating_light: Eval failed for `{job}` + + See `{job.paths.stderr}` + """.strip() + notify(msg) + raise + + +def run_eval_shell_script(): + parser = argparse.ArgumentParser( + prog="run-eval", + description="Run eval harness", + ) + + parser.add_argument("harness_script") + + args = parser.parse_args() + + base_args = { + # just "gpus" does not work + "slurm_gpus_per_task": 8, + "name": "eval", + "slurm_account": "stablegpt", + "slurm_partition": "g40", + "slurm_cpus_per_gpu": 12, + # Default timeout is 5 minutes??? 
+ "timeout_min": 0, + } + + executor = submitit.AutoExecutor(folder="./logs") + executor.update_parameters(**base_args) + + # Harness scripts expect the cwd to be the repo root + spath = Path(args.harness_script) + repo = str(spath.parent.parent.parent.parent) + print("repo path:", repo) + # the eval harness relies on validating cli args, so it's difficult to run + # directly from Python. Use the harness.sh scripts for now. + # Also note this needs to be a list of strings. + harness = CommandFunction(["bash", args.harness_script], cwd=repo) + + job = executor.submit(harness) + print("Submitted job") + print("See log at:") + print(f"\t{job.paths.stdout}") + + try: + output = job.result() + print("Job finished successfully!") + notify(f":white_check_mark: Eval Finished for `{args.harness_script}`") + return output + except Exception as ee: # noqa: F841 + # submitit doesn't seem to have a parent class for their exceptions, so + # just catch everything. We want to be aware of any failure anyway. + # If this is noisy we can ignore certain kinds of early failures. + + msg = f""" + :rotating_light: Eval failed for `{args.harness_script}` + + See `{job.paths.stderr}` + """.strip() + notify(msg) + raise + + +def run_eval_submitit(): + """Run evaluation using submitit.""" + executor = build_executor() + # By wrapping everything in a function, we don't have to pass args. 
+ run_job(executor, eval_task) + + +if __name__ == "__main__": + run_eval_shell_script() diff --git a/scripts/yans/eval/lm-evaluation-harness/scripts/run_suite.py b/scripts/yans/eval/lm-evaluation-harness/scripts/run_suite.py new file mode 100644 index 0000000000000000000000000000000000000000..f3fa20fc6986a88237554fd2159b1d03afed923a --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/scripts/run_suite.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +# Run a suite of tests + +import argparse + +from lm_eval import evaluator +from lm_eval.prompts import get_prompt_code +from lm_eval.suites import TaskSpec, load_suite + + +def build_eval_args(specs: list[TaskSpec], prompt: str) -> tuple[list[str], list[int]]: + """Convert list of TaskSpecs into args for simple_evaluate.""" + + tasks = [] + fewshot = [] + for spec in specs: + task_name = spec.name + + code = get_prompt_code(prompt, task_name) + + if spec.version is not None: + task_name += "-" + spec.version + "-" + code + + tasks.append(task_name) + fewshot.append(spec.fewshot) + + return (tasks, fewshot) + + +def run_suite( + model_args, + suite, + prompt, + *, + model_type="hf-causal", + output=None, + verbose=False, + limit=None, +): + # Confusing detail: in the "simple evaluate", "model" is the HF model type, + # which is almost always hf-causal or hf-causal-experimental. `model_args` + # looks like this: + # + # pretrained=hoge/piyo,tokenizer=...,asdf=... 
+ + # device never changes in practice + device = "cuda" + + specs = load_suite(suite) + tasks, num_fewshot = build_eval_args(specs, prompt) + + evaluator.simple_evaluate( + model=model_type, + model_args=model_args, + tasks=tasks, + num_fewshot=num_fewshot, + device=device, + verbose=verbose, + limit=limit, + ) + + +def main(): + parser = argparse.ArgumentParser( + prog="run_suite.py", description="Run a test suite with a model" + ) + parser.add_argument("model", help="Model path (or HF spec)") + parser.add_argument("suite", help="Test suite to run") + parser.add_argument("prompt", help="Prompt to use") + parser.add_argument("-m", "--model_args", help="Additional model arguments") + parser.add_argument( + "-t", "--model_type", default="hf-causal-experimental", help="Model type" + ) + parser.add_argument("-o", "--output", help="Output file") + parser.add_argument("-v", "--verbose", action="store_true") + + # TODO would it be better to just use a "quick" setting that runs 10 + # iterations? We don't need arbitrary numeric control + parser.add_argument( + "-l", "--limit", type=int, help="number of iterations to run (for testing)" + ) + + args = parser.parse_args() + + margs = f"pretrained={args.model}" + if args.model_args: + margs = args.model + "," + args.model_args + + run_suite( + margs, + args.suite, + args.prompt, + model_type=args.model_type, + output=args.output, + verbose=args.verbose, + limit=args.limit, + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/yans/eval/lm-evaluation-harness/scripts/run_task.sh b/scripts/yans/eval/lm-evaluation-harness/scripts/run_task.sh new file mode 100644 index 0000000000000000000000000000000000000000..a925a97e7d4eea0241c767e1872fa8fc7ad5b169 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/scripts/run_task.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Run a single eval task for a given model. Should be called from srun. +# NOTE: requires a venv to be prepared. 
+set -eou pipefail + +TASK=$1 # ex: xwinograd_ja +FEWSHOT=$2 # ex: 0 +MODEL_ARGS="$3" # ex: pretrained=abeja/gpt-neox-japanese-2.7b +# Note that model path is often slightly different from hf name +MODEL_PATH=$4 # ex: abeja/gpt-neox-japanese-2.7b + +# assuems this is in the scripts/ dir of the harness repo +cd $(dirname "$0")/.. +source env/bin/activate +python main.py \ + --model hf-causal \ + --model_args $MODEL_ARGS \ + --tasks $TASK \ + --num_fewshot "$FEWSHOT" \ + --device "cuda" \ + --output_path "models/$MODEL_PATH/result.$TASK.json" diff --git a/scripts/yans/eval/lm-evaluation-harness/scripts/run_task_batch.sh b/scripts/yans/eval/lm-evaluation-harness/scripts/run_task_batch.sh new file mode 100644 index 0000000000000000000000000000000000000000..9a982b7646e3398b3732db4acde246cc77ef6cf9 --- /dev/null +++ b/scripts/yans/eval/lm-evaluation-harness/scripts/run_task_batch.sh @@ -0,0 +1,19 @@ +#!/bin/bash +#SBATCH --account="stablegpt" +#SBATCH --job-name="jp-eval" +#SBATCH --partition=g40 +#SBATCH --cpus-per-task=12 +#SBATCH --gpus=1 +#SBATCH --mem-per-cpu=11G +#SBATCH --output=./slurm_outs/%x_%j.out +#SBATCH --error=./slurm_outs/%x_%j.err + +# This command just holds the sbatch config and should be called from +# run_task_for_models.sh. +set -eou pipefail + +# would be better if this wasn't relative to a home dir, but slurm runs a copy +# of this script instead of the original, and finding the original path can get +# a little complicated, so keeping it simple for now. This could be improved. 
import argparse
import numpy as np
import json
import os
import random
from lm_eval import tasks
from lm_eval.utils import join_iters

# Divider written before every rendered example in the per-task output files.
EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n"


def parse_args():
    """CLI for dumping rendered few-shot contexts to files for inspection."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_base_path", required=True)
    parser.add_argument("--tasks", default="all_tasks")
    parser.add_argument("--provide_description", action="store_true")
    parser.add_argument("--sets", type=str, default="val")  # example: val,test
    parser.add_argument("--num_fewshot", type=int, default=1)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--num_examples", type=int, default=1)
    parser.add_argument("--description_dict_path", default=None)
    return parser.parse_args()


def main():
    """Write `num_examples` fewshot contexts per task under output_base_path."""
    args = parse_args()
    np.random.seed(args.seed)

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    # Optional per-task descriptions, keyed by task name.
    description_dict = {}
    if args.description_dict_path:
        with open(args.description_dict_path, "r") as f:
            description_dict = json.load(f)

    os.makedirs(args.output_base_path, exist_ok=True)
    for task_name, task in task_dict.items():
        # fresh, seeded RNG per task so output is reproducible per task
        rnd = random.Random()
        rnd.seed(args.seed)

        iters = []

        # BUG FIX: the original shadowed the builtin `set` and appended a
        # stale `docs` variable unconditionally — the first unavailable split
        # raised NameError, and later unavailable splits silently duplicated
        # the previous split's documents.
        for split in args.sets.split(","):
            docs = None
            if split == "train" and task.has_training_docs():
                docs = task.training_docs()
            elif split == "val" and task.has_validation_docs():
                docs = task.validation_docs()
            elif split == "test" and task.has_test_docs():
                docs = task.test_docs()
            if docs is not None:
                iters.append(docs)

        docs = join_iters(iters)

        description = (
            description_dict[task_name]
            if description_dict and task_name in description_dict
            else ""
        )

        with open(os.path.join(args.output_base_path, task_name), "w") as f:
            # num_examples > 0 limits output; <= 0 dumps every document
            for i, doc in (
                zip(range(args.num_examples), docs)
                if args.num_examples > 0
                else enumerate(docs)
            ):
                f.write(EXAMPLE_DIVIDER.format(i=i))
                ctx = task.fewshot_context(
                    doc=doc,
                    num_fewshot=args.num_fewshot,
                    rnd=rnd,
                    description=description,
                )
                f.write(ctx + "\n")


if __name__ == "__main__":
    main()