diff --git a/scripts/yans/lm-evaluation-harness/tests/models/test_api.py b/scripts/yans/lm-evaluation-harness/tests/models/test_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bca2f7bdbc479d6c8c45171347f11dd8c8892d9
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/models/test_api.py
@@ -0,0 +1,149 @@
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from lm_eval.models.openai_completions import LocalCompletionsAPI
+
+
+@pytest.fixture
+def api():
+    """Return a LocalCompletionsAPI client created with tokenizer_backend=None."""
+    url, name = "http://test-url.com", "gpt-3.5-turbo"
+    return LocalCompletionsAPI(base_url=url, tokenizer_backend=None, model=name)
+
+
+@pytest.fixture
+def api_tokenized():
+    """Return a LocalCompletionsAPI client using the "huggingface" tokenizer backend."""
+    url, name = "http://test-url.com", "EleutherAI/pythia-1b"
+    return LocalCompletionsAPI(
+        base_url=url, model=name, tokenizer_backend="huggingface"
+    )
+
+
+def test_create_payload_generate(api):
+    """Generation payload carries `until` as `stop` and has no `do_sample` key."""
+    prompts = ["Generate a story"]
+    opts = {
+        "max_tokens": 100,
+        "temperature": 0.7,
+        "until": ["The End"],
+        "do_sample": True,
+        "seed": 1234,
+    }
+    expected = {
+        "prompt": ["Generate a story"],
+        "model": "gpt-3.5-turbo",
+        "max_tokens": 100,
+        "temperature": 0.7,
+        "stop": ["The End"],
+        "seed": 1234,
+    }
+    assert api._create_payload(prompts, generate=True, gen_kwargs=opts) == expected
+
+
+def test_create_payload_loglikelihood(api):
+    """With generate=False expect max_tokens=1, logprobs=1 and echo=True."""
+    expected = {
+        "model": "gpt-3.5-turbo",
+        "prompt": ["The capital of France is"],
+        "max_tokens": 1,
+        "logprobs": 1,
+        "echo": True,
+        "temperature": 0,
+        "seed": 1234,
+    }
+    prompts = ["The capital of France is"]
+    assert api._create_payload(prompts, generate=False, gen_kwargs=None) == expected
+
+
+@pytest.mark.parametrize(
+    "input_messages, generate, gen_kwargs, expected_payload",
+    [
+        (
+            ["Hello, how are"],
+            True,
+            {"max_gen_toks": 100, "temperature": 0.7},
+            {
+                "prompt": "Hello, how are",
+                "model": "gpt-3.5-turbo",
+                "max_tokens": 100,
+                "temperature": 0.7,
+                "stop": ["<|endoftext|>"],
+                "seed": 1234,
+            },
+        ),
+        (
+            ["Hello, how are", "you"],
+            True,
+            {},
+            {
+                "prompt": "Hello, how are",
+                "model": "gpt-3.5-turbo",
+                "max_tokens": 256,
+                "temperature": 0,
+                "stop": ["<|endoftext|>"],
+                "seed": 1234,
+            },
+        ),
+    ],
+)
+def test_model_generate_call_usage(
+    api, input_messages, generate, gen_kwargs, expected_payload
+):
+    """Check that model_call POSTs expected_payload as JSON and returns the reply."""
+    canned_reply = {"result": "success"}
+    with patch("requests.post") as post_mock:
+        # The patched requests.post hands back a mock whose .json() is canned.
+        post_mock.return_value.json.return_value = canned_reply
+
+        outcome = api.model_call(
+            input_messages, generate=generate, gen_kwargs=gen_kwargs
+        )
+
+        # Exactly one POST must have been made, carrying the payload as `json`.
+        post_mock.assert_called_once()
+        sent_kwargs = post_mock.call_args[1]
+        assert "json" in sent_kwargs
+        assert sent_kwargs["json"] == expected_payload
+        assert outcome == {"result": "success"}
+
+
+@pytest.mark.parametrize(
+    "input_messages, generate, gen_kwargs, expected_payload",
+    [
+        (
+            [[1, 2, 3, 4, 5]],
+            False,
+            None,
+            {
+                "model": "EleutherAI/pythia-1b",
+                "prompt": [[1, 2, 3, 4, 5]],
+                "max_tokens": 1,
+                "logprobs": 1,
+                "echo": True,
+                "seed": 1234,
+                "temperature": 0,
+            },
+        ),
+    ],
+)
+def test_model_tokenized_call_usage(
+    api_tokenized, input_messages, generate, gen_kwargs, expected_payload
+):
+    """Check that model_call POSTs the token-id prompt as JSON and returns the reply."""
+    canned_reply = {"result": "success"}
+    with patch("requests.post") as post_mock:
+        # The patched requests.post hands back a mock whose .json() is canned.
+        post_mock.return_value.json.return_value = canned_reply
+
+        outcome = api_tokenized.model_call(
+            input_messages, generate=generate, gen_kwargs=gen_kwargs
+        )
+
+        # Exactly one POST must have been made, carrying the payload as `json`.
+        post_mock.assert_called_once()
+        sent_kwargs = post_mock.call_args[1]
+        assert "json" in sent_kwargs
+        assert sent_kwargs["json"] == expected_payload
+        assert outcome == {"result": "success"}
diff --git a/scripts/yans/lm-evaluation-harness/tests/models/test_gguf.py b/scripts/yans/lm-evaluation-harness/tests/models/test_gguf.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5e197e77418fe83e3cc1cf96e23223b80afe633
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/models/test_gguf.py
@@ -0,0 +1,152 @@
+import hashlib
+import json
+import os
+import pickle
+import unittest
+from unittest.mock import patch
+
+from lm_eval.api.instance import Instance
+from lm_eval.models.gguf import GGUFLM
+
+
+base_url = "https://matthoffner-ggml-llm-api.hf.space"
+
+
+def gguf_completion_mock(base_url=None, **kwargs):
+    """Mock for GGUFLM.gguf_completion that replays pickled fixture responses.
+
+    The arguments are hashed into a fixture path; a fixture found there is
+    returned, otherwise a canned response is built, saved, and returned.
+    """
+    serialized = json.dumps({"base_url": base_url, **kwargs}, sort_keys=True)
+    parameters_hash = hashlib.sha256(serialized.encode("utf-8")).hexdigest()
+    fname = f"./tests/testdata/gguf_test_{parameters_hash}.pkl"
+
+    if os.path.exists(fname):
+        # NOTE(review): pickle.load is only safe on trusted fixture files.
+        with open(fname, "rb") as fh:
+            return pickle.load(fh)
+
+    print("The file does not exist, attempting to write...")
+    if "stop" in kwargs:
+        stop_choice = {
+            "text": f"generated text until {kwargs['stop']}",
+            "logprobs": {"token_logprobs": [-1.2345], "text_offset": 0},
+            "finish_reason": "length",
+        }
+        result = {"choices": [stop_choice]}
+    else:
+        # Generated with: curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{"prompt": "string", "logprobs": 10, "temperature": 0.0, "max_tokens": 1, "echo": true}'
+        result = {
+            "id": "cmpl-4023976b-bc6a-43b0-a5a9-629f4216c7f3",
+            "object": "text_completion",
+            "created": 1700511361,
+            "model": "../llama-2-7b.Q8_0.gguf",
+            "choices": [
+                {
+                    "text": "string(",
+                    "index": 0,
+                    "logprobs": {
+                        "text_offset": [0, 7],
+                        "token_logprobs": [None, -1.033263319857306],
+                        "tokens": [" string", "("],
+                        "top_logprobs": [
+                            None,
+                            {
+                                "(": -1.033263319857306,
+                                "[]": -2.6530743779017394,
+                                ".": -3.0377145947291324,
+                                "\n": -3.0399156750513976,
+                                "_": -3.510376089937872,
+                                " =": -3.6957918347193663,
+                                ",": -3.9309459866358702,
+                                " of": -4.2834550083949035,
+                                '("': -4.322762841112799,
+                                "()": -4.426229113466925,
+                            },
+                        ],
+                    },
+                    "finish_reason": "length",
+                }
+            ],
+            "usage": {
+                "prompt_tokens": 2,
+                "completion_tokens": 1,
+                "total_tokens": 3,
+            },
+        }
+
+    # Saving the fixture is best-effort: a failure is printed, not raised.
+    try:
+        os.makedirs(os.path.dirname(fname), exist_ok=True)
+        print("Writing file at", fname)
+        with open(fname, "wb") as fh:
+            pickle.dump(result, fh)
+        print("File written successfully")
+    except Exception as err:
+        print("File writing failed:", err)
+
+    return result
+
+
+class GGUFLMTest(unittest.TestCase):
+    @patch(
+        "lm_eval.models.gguf.GGUFLM.gguf_completion", side_effect=gguf_completion_mock
+    )
+    def test_loglikelihood(self, mock_completion):
+        """Expect loglikelihood to yield (0, True) for both ("str", "ing") requests."""
+        pairs = [("str", "ing"), ("str", "ing")]
+        requests = []
+        for idx, pair in enumerate(pairs):
+            requests.append(
+                Instance(
+                    request_type="loglikelihood",
+                    doc=pair,
+                    arguments=pair,
+                    idx=idx,
+                )
+            )
+
+        model = GGUFLM(base_url)
+        scores = model.loglikelihood(requests)
+
+        self.assertEqual(scores, [(0, True), (0, True)])
+
+    @patch(
+        "lm_eval.models.gguf.GGUFLM.gguf_completion", side_effect=gguf_completion_mock
+    )
+    def test_generate_until(self, mock_completion):
+        """Expect generate_until to return the mocked text for each (prompt, stop)."""
+        cases = [("input1", "stop1"), ("input2", "stop2")]
+        requests = []
+        for idx, (prompt, stop) in enumerate(cases):
+            requests.append(
+                Instance(
+                    request_type="generate_until",
+                    doc={"input": prompt},
+                    arguments=(prompt, {"until": stop}),
+                    idx=idx,
+                )
+            )
+
+        model = GGUFLM(base_url)
+        outputs = model.generate_until(requests)
+
+        expected = ["generated text until stop1", "generated text until stop2"]
+        self.assertEqual(outputs, expected)
+
+    # @patch('lm_eval.models.gguf.GGUFLM.gguf_completion', side_effect=gguf_completion_mock)
+    # def test_loglikelihood_rolling(self, gguf_completion_mock):
+    #     lm = GGUFLM(base_url)
+
+    #     # Test loglikelihood_rolling
+    #     requests = ["input1", "input2"]
+    #     res = lm.loglikelihood_rolling(requests)
+
+    #     # Assert the loglikelihood_rolling response is correct
+    #     expected_res = [(-1.2345, True), (-1.2345, True)]
+    #     self.assertEqual(res, expected_res)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/scripts/yans/lm-evaluation-harness/tests/models/test_huggingface.py b/scripts/yans/lm-evaluation-harness/tests/models/test_huggingface.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f495402397a7b336384bb8d2ffe8fd24ddce706
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/models/test_huggingface.py
@@ -0,0 +1,148 @@
+from __future__ import annotations
+
+import os
+import sys
+from pathlib import Path
+
+import numpy as np
+import torch
+
+from lm_eval import tasks
+from lm_eval.api.instance import Instance
+from lm_eval.models.huggingface import HFLM
+
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+task_manager = tasks.TaskManager()
+
+TEST_STRING = "foo bar"
+
+
+class Test_HFLM:
+    """Regression tests for HFLM on EleutherAI/pythia-70m (cpu, float32)."""
+    torch.use_deterministic_algorithms(True)
+    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
+    version_minor = sys.version_info.minor
+    multiple_choice_task = task_list["arc_easy"]  # type: ignore
+    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
+    MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
+    generate_until_task = task_list["gsm8k"]  # type: ignore
+    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
+    generate_until_task.set_fewshot_seed(1234)  # fewshot random generator seed
+    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
+    generate_until: list[Instance] = generate_until_task.instances
+    rolling_task = task_list["wikitext"]  # type: ignore
+    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
+    ROLLING: list[Instance] = rolling_task.instances
+
+    MULTIPLE_CH_RES = [
+        -41.902435302734375,
+        -42.939308166503906,
+        -33.914180755615234,
+        -37.07139205932617,
+        -22.95258331298828,
+        -20.342208862304688,
+        -14.818366050720215,
+        -27.942853927612305,
+        -15.80704116821289,
+        -15.936427116394043,
+        -13.052018165588379,
+        -18.04828453063965,
+        -13.345029830932617,
+        -13.366025924682617,
+        -12.127134323120117,
+        -11.872495651245117,
+        -47.10598373413086,
+        -47.76410675048828,
+        -36.4406852722168,
+        -50.0289421081543,
+        -16.72093963623047,
+        -18.535587310791016,
+        -26.46993637084961,
+        -20.355995178222656,
+        -17.757919311523438,
+        -21.80595588684082,
+        -33.1990852355957,
+        -39.28636932373047,
+        -14.759679794311523,
+        -16.753942489624023,
+        -11.486852645874023,
+        -15.42177677154541,
+        -13.15798282623291,
+        -15.887393951416016,
+        -15.28614616394043,
+        -12.339089393615723,
+        -44.59441375732422,
+        -55.40888214111328,
+        -52.70050811767578,
+        -56.25089645385742,
+    ]
+    generate_until_RES = [
+        " The average of $2.50 each is $",
+        " A robe takes 2 bolts of blue fiber and half",
+        " $50,000 in repairs.\n\nQuestion",
+        " He runs 1 sprint 3 times a week.",
+        " They feed each of her chickens three cups of mixed",
+        " The price of the glasses is $5, but",
+        " The total percentage of students who said they like to",
+        " Carla is downloading a 200 GB file. Normally",
+        " John drives for 3 hours at a speed of 60",
+        " Eliza sells 4 tickets to 5 friends so she",
+    ]
+    ROLLING_RES = [
+        -3603.6328125,
+        -19779.23974609375,
+        -8834.16455078125,
+        -27967.591796875,
+        -7636.794982910156,
+        -9491.93505859375,
+        -41043.4248046875,
+        -8397.689819335938,
+        -45969.47155761719,
+        -7158.90625,
+    ]
+    LM = HFLM(pretrained="EleutherAI/pythia-70m", device="cpu", dtype="float32")
+
+    def test_logliklihood(self) -> None:
+        """Check log-likelihoods and their row-wise argmax against MULTIPLE_CH_RES."""
+        expected = self.MULTIPLE_CH_RES
+        actual = [pair[0] for pair in self.LM.loglikelihood(self.MULTIPLE_CH)]
+
+        # Write the computed values under test_logs/ (log samples to CI).
+        log_dir = Path("test_logs")
+        log_dir.mkdir(parents=True, exist_ok=True)
+        log_file = (log_dir / f"outputs_log_{self.version_minor}.txt").resolve()
+        with open(log_file, "w", encoding="utf-8") as handle:
+            handle.write("\n".join(str(value) for value in actual))
+
+        assert np.allclose(actual, expected, atol=1e-2)
+        # Scores are grouped in rows of four; each row's argmax must agree.
+        expected_choice = np.argmax(np.array(expected).reshape(-1, 4), axis=1)
+        actual_choice = np.argmax(np.array(actual).reshape(-1, 4), axis=1)
+        assert (expected_choice == actual_choice).all()
+
+    def test_generate_until(self) -> None:
+        """The gsm8k generations must equal generate_until_RES exactly."""
+        assert self.LM.generate_until(self.generate_until) == self.generate_until_RES
+
+    def test_logliklihood_rolling(self) -> None:
+        rolling = self.LM.loglikelihood_rolling(self.ROLLING)  # wikitext requests
+        assert np.allclose(rolling, self.ROLLING_RES, atol=1e-1)
+
+    def test_toc_encode(self) -> None:
+        """tok_encode must map TEST_STRING to the two expected token ids."""
+        assert self.LM.tok_encode(TEST_STRING) == [12110, 2534]
+
+    def test_toc_decode(self) -> None:
+        """tok_decode must map the two token ids back to TEST_STRING."""
+        assert self.LM.tok_decode([12110, 2534]) == TEST_STRING
+
+    def test_batch_encode(self) -> None:
+        ids = self.LM.tok_batch_encode([TEST_STRING, "bar foo"])[0].tolist()
+        assert ids == [[12110, 2534], [2009, 17374]]  # one id row per input string
+
+    def test_model_generate(self) -> None:
+        """_model_generate output for TEST_STRING must decode to the stored text."""
+        prompt_ids = self.LM.tok_batch_encode([TEST_STRING])[0]
+        generated = self.LM._model_generate(prompt_ids, max_length=10, stop=["\n\n"])
+        assert self.LM.tok_decode(generated[0]) == "foo bar\n<bazhang>!info bar"
diff --git a/scripts/yans/lm-evaluation-harness/tests/models/test_neuron_optimum.py b/scripts/yans/lm-evaluation-harness/tests/models/test_neuron_optimum.py
new file mode 100644
index 0000000000000000000000000000000000000000..564d52303968e210439a7931f012487a959a367f
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/models/test_neuron_optimum.py
@@ -0,0 +1,26 @@
+import pytest
+import torch
+
+from lm_eval.models.neuron_optimum import wrap_constant_batch_size
+
+
+def test_wrap_constant_batch_size():
+    """Smaller batches round-trip unchanged; an oversized batch raises ValueError."""
+    class Tester:
+        def __init__(self, batch_size):
+            self.batch_size = batch_size
+
+        @wrap_constant_batch_size
+        def test_constant_batch_size(self, inputs):
+            assert len(inputs) == self.batch_size
+            return inputs
+
+    batch_size_test = 8
+    for i in range(1, batch_size_test + 1):
+        tensor = torch.ones([i, 2, 2])
+        out = Tester(batch_size=batch_size_test).test_constant_batch_size(tensor)
+        torch.testing.assert_close(out, tensor)  # replaces deprecated assert_allclose
+
+    oversized = torch.ones([batch_size_test + 1, 2, 2])  # built outside the raises block
+    with pytest.raises(ValueError):
+        Tester(batch_size=batch_size_test).test_constant_batch_size(oversized)
diff --git a/scripts/yans/lm-evaluation-harness/tests/models/test_openvino.py b/scripts/yans/lm-evaluation-harness/tests/models/test_openvino.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8f13cd9adb3d3850a28055c9a6daf43d40e3874
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/models/test_openvino.py
@@ -0,0 +1,92 @@
+import random
+import tempfile
+from pathlib import Path
+
+import pytest
+from optimum.intel import OVModelForCausalLM
+from transformers import AutoTokenizer
+
+from lm_eval import evaluator
+from lm_eval.api.registry import get_model
+
+
+SUPPORTED_ARCHITECTURES_TASKS = {
+    "facebook/opt-125m": "lambada_openai",
+    "hf-internal-testing/tiny-random-gpt2": "wikitext",
+}
+
+
+@pytest.mark.parametrize("model_id,task", SUPPORTED_ARCHITECTURES_TASKS.items())
+def test_evaluator(model_id, task):
+    """Smoke-test the "openvino" model wrapper through simple_evaluate.
+
+    The checkpoint is exported to a temporary directory, and both
+    log-likelihood methods are replaced by seeded random stubs.
+    """
+
+    def ll_fn(reqs):
+        """Assert the space convention, then return seeded stub (score, False) pairs."""
+        for req in reqs:
+            ctx, cont = req.args
+            if len(ctx) > 0:
+                assert ctx[-1] != " "
+                assert cont[0] == " " or ctx[-1] == "\n"
+
+        # Re-seeding on every call keeps the stubbed scores reproducible.
+        random.seed(42)
+        return [(-random.random(), False) for _ in reqs]
+
+    def ll_perp_fn(reqs):
+        """Assert each request holds a single string, then return seeded scores."""
+        for req in reqs:
+            (string,) = req.args
+            assert isinstance(string, str)
+
+        random.seed(42)
+        return [-random.random() for _ in reqs]
+
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        # Export the checkpoint and its tokenizer into the temporary directory.
+        exported = OVModelForCausalLM.from_pretrained(
+            model_id, export=True, use_cache=True
+        )
+        exported.save_pretrained(tmpdirname)
+        AutoTokenizer.from_pretrained(model_id).save_pretrained(tmpdirname)
+
+        lm = get_model("openvino").create_from_arg_string(
+            f"pretrained={tmpdirname}",
+            {
+                "batch_size": 1,
+                "device": "cpu",
+            },
+        )
+
+        # Swap the real scoring methods for the stubs defined above.
+        lm.loglikelihood = ll_fn
+        lm.loglikelihood_rolling = ll_perp_fn
+
+        evaluator.simple_evaluate(
+            model=lm,
+            tasks=[task],
+            num_fewshot=0,
+            limit=10,
+            bootstrap_iters=10,
+        )
+
+
+def test_ov_config():
+    """Test that if specified, a custom OpenVINO config is loaded correctly."""
+    model_id = "hf-internal-testing/tiny-random-gpt2"
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        # Write the custom property file that is passed through ov_config.
+        config_path = Path(tmpdirname) / "ov_config.json"
+        config_path.write_text(
+            '{"DYNAMIC_QUANTIZATION_GROUP_SIZE" : "32"}', encoding="utf-8"
+        )
+        arg_string = f"pretrained={model_id},ov_config={config_path}"
+        lm = get_model("openvino").create_from_arg_string(arg_string)
+
+    # The property is read back after the temporary directory has been removed.
+    compiled = lm.model.request.get_compiled_model()
+    group_size = compiled.get_property("DYNAMIC_QUANTIZATION_GROUP_SIZE")
+    assert group_size == 32
diff --git a/scripts/yans/lm-evaluation-harness/tests/models/test_vllm.py b/scripts/yans/lm-evaluation-harness/tests/models/test_vllm.py
new file mode 100644
index 0000000000000000000000000000000000000000..01363bc8dc31b43549f62120a8ce9fde0788b144
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/models/test_vllm.py
@@ -0,0 +1,50 @@
+from typing import List
+
+import pytest
+
+from lm_eval import tasks
+from lm_eval.api.instance import Instance
+
+
+task_manager = tasks.TaskManager()
+
+
+@pytest.mark.skip(reason="requires CUDA")
+class Test_VLLM:
+    """Length and type checks for the VLLM model wrapper (skipped: requires CUDA)."""
+    vllm = pytest.importorskip("vllm")
+    try:
+        from lm_eval.models.vllm_causallms import VLLM
+        LM = VLLM(pretrained="EleutherAI/pythia-70m")
+    except ModuleNotFoundError:
+        pass  # NOTE(review): LM stays undefined when this import fails
+    # torch.use_deterministic_algorithms(True)
+    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
+    multiple_choice_task = task_list["arc_easy"]  # type: ignore
+    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
+    MULTIPLE_CH: List[Instance] = multiple_choice_task.instances
+    generate_until_task = task_list["gsm8k"]  # type: ignore
+    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
+    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
+    generate_until: List[Instance] = generate_until_task.instances
+    rolling_task = task_list["wikitext"]  # type: ignore
+    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
+    ROLLING: List[Instance] = rolling_task.instances
+
+    # TODO: make proper tests
+    def test_logliklihood(self) -> None:
+        """Expect one result per request, each with a float first element."""
+        scored = self.LM.loglikelihood(self.MULTIPLE_CH)
+        assert len(scored) == len(self.MULTIPLE_CH)
+        assert all(isinstance(item[0], float) for item in scored)
+
+    def test_generate_until(self) -> None:
+        """Expect one string per generate_until request."""
+        generated = self.LM.generate_until(self.generate_until)
+        assert len(generated) == len(self.generate_until)
+        assert all(isinstance(text, str) for text in generated)
+
+    def test_logliklihood_rolling(self) -> None:
+        """Expect every rolling log-likelihood to be a float."""
+        rolling = self.LM.loglikelihood_rolling(self.ROLLING)
+        assert all(isinstance(value, float) for value in rolling)
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v2.0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v2.0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..53b28b5b86050168e13400d47dbf169de133d035
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v2.0-loglikelihood
@@ -0,0 +1 @@
+8ebbbc510644ede7bf53496c381e276d5a1eec14828870e8b7e611f231e6d5f6
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_3ds-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_3ds-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..d76cc9bdf55935bc1bc4e71d35267cb58ec618ef
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_3ds-v0-res.json
@@ -0,0 +1 @@
+{"results": {"arithmetic_3ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_3ds": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_4ds-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_4ds-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..154cf9c5946ed829ce7e2f173a2b03554fe789a1
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_4ds-v0-loglikelihood
@@ -0,0 +1 @@
+d915830b8621e66331383bb2ae4c60acebf008e2f94741092ef4c33ea5441037
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..f808af460570411d4616b6187dd67fa2ddd6ecee
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood
@@ -0,0 +1 @@
+7fab9f02e71a224ae7931aa77f8a9a61d887a7480756adc965d4746e97fb04a5
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..276f03f76d1f76f242415e9cdeabf368c9a0f8ce
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_determiner_noun_agreement_with_adj_irregular_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adj_irregular_2": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..66b30be1b864c277e52541b2bd54cda1eb51d4a0
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_determiner_noun_agreement_with_adjective_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adjective_1": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_2-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_2-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..1005f68060123bf94b6bf001f9284a7070a64258
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_2-v0-loglikelihood
@@ -0,0 +1 @@
+0523771a217759f0b22b89807694ee7f6381ce98a584b1fd070ba96194a3273b
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_inchoative-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_inchoative-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..b494980087dc4ac33621cca2fe716f1fee83fbd1
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_inchoative-v0-loglikelihood
@@ -0,0 +1 @@
+3ff73629fb4473986a0e8ae2fcb7c40e88292189ab0d8755d20836c5aa5a2f99
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_past_participle_verbs-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_past_participle_verbs-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f6b991cfefa168f678db82904660157cdc27
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_past_participle_verbs-v0-loglikelihood
@@ -0,0 +1 @@
+63ec733873f94ace71cb34112d1c3cd5bb768c26b975fb90acc9b8ba3f4e938e
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..187b79e94c9ec4c378da110948775afc8be14920
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood
@@ -0,0 +1 @@
+9534751f83a86b6cbe1fb12fb9feb827b0b7836a663108928b4ecc1d70b08871
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_c_command-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_c_command-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..43fadc2e0b0ea5cd762868a13629b85daec7f499
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_c_command-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_principle_A_c_command": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_c_command": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_reconstruction-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_reconstruction-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..0e7d8db1e2ad279ed4bfcc094253f1fa7723b6ce
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_reconstruction-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_principle_A_reconstruction": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_reconstruction": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d50683774ecacf772eaf6287328994d4abc0a98
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_wh_island": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_island": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..14c1bf5f5ee1300b8652f6a73185badea754ec73
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v0-loglikelihood
@@ -0,0 +1 @@
+de5aa6f77a2e0fd050b9c272f10c4d5d5581e4f75ffa60926f79e60ae1738960
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/cola-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/cola-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..45737909e7c21c528a647e91cceca3d2534869fc
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/cola-v0-loglikelihood
@@ -0,0 +1 @@
+e8635578ed8ee70b707a666d35e468b9321db24470f80c92080651e2bfa01751
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..7941ad62997cb5129be9390a727352b689f807ae
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v1-res.json
@@ -0,0 +1 @@
+{"results": {"coqa": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"coqa": 1}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_disability-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_disability-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..cb2b8b79ac3d37913992a56e688ea80d24c0af9e
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_disability-v0-res.json
@@ -0,0 +1 @@
+{"results": {"crows_pairs_french_disability": {"likelihood_difference": 0.31387939561315326, "likelihood_difference_stderr": 0.027598132299657168, "pct_stereotype": 0.36363636363636365, "pct_stereotype_stderr": 0.05966637484671758}}, "versions": {"crows_pairs_french_disability": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_gender-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_gender-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..c1713a5a881657c9ae4417f6adcf7480491a2915
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_gender-v0-loglikelihood
@@ -0,0 +1 @@
+010b8404655911c86555616da23afffce9dc3981e1acbbfdb022d9c474430209
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..e6e282414b02b032bd5b879775686c24e731fd9d
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-loglikelihood
@@ -0,0 +1 @@
+146eb60c8796fe3f25307a6776337f0b077b58ce02edec64c99df4b906c19b9f
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..f9dd321f7f3c9525491145df99fb4f7658be8065
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-res.json
@@ -0,0 +1 @@
+{"results": {"crows_pairs_french_nationality": {"likelihood_difference": 0.33534193269044926, "likelihood_difference_stderr": 0.01429836309463257, "pct_stereotype": 0.4743083003952569, "pct_stereotype_stderr": 0.031455431847992904}}, "versions": {"crows_pairs_french_nationality": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/drop-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/drop-v0-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..6470b349d2e2a54c1ab113346885eb97c045a0ed
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/drop-v0-greedy_until
@@ -0,0 +1 @@
+ca566c630d8ac853d5785d4b5c40a5137172c34b48af3350e1f79e6d548b36ba
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_justice-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_justice-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..39efbc506acfd90b86362185d28d43090aeb7d1c
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_justice-v0-res.json
@@ -0,0 +1 @@
+{"results": {"ethics_justice": {"acc": 0.49556213017751477, "acc_stderr": 0.009616784279885177, "em": 0.057692307692307696}}, "versions": {"ethics_justice": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/gguf_test_52ea409606de8755e03cf7c79f824101a4ce64bb6e6d3df556b8a4e7a5d92418.pkl b/scripts/yans/lm-evaluation-harness/tests/testdata/gguf_test_52ea409606de8755e03cf7c79f824101a4ce64bb6e6d3df556b8a4e7a5d92418.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..f468bb46d3da891a285c615b25de9b2d99a7fd8d
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/gguf_test_52ea409606de8755e03cf7c79f824101a4ce64bb6e6d3df556b8a4e7a5d92418.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4f122bfaa24901cff1ee686da0cf49ade7b6877c31a3daeb32c8cf2e328a77e
+size 153
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_cfd11f555a5a63b6dfa114a55a932e51b724cdd44d4842586b9ce37260bf7aaa.pkl b/scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_cfd11f555a5a63b6dfa114a55a932e51b724cdd44d4842586b9ce37260bf7aaa.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..f27281ef4ea5c0438cbc9bff8ffdbc40a2c847f8
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_cfd11f555a5a63b6dfa114a55a932e51b724cdd44d4842586b9ce37260bf7aaa.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d531b0854314516cad7d56c7e28a694bf23072429147b235e9c6534492867bb2
+size 2984
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..11f07878fb5452ac334eaf0daf276aa8684124f6
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-loglikelihood
@@ -0,0 +1 @@
+09da45119b12a0144e3081f8fb790c2a22af7b9c3aac42f54423d348a711fbf5
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..6ac5a9c0b8e70a47f2c985713a50336c68b11382
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-res.json
@@ -0,0 +1 @@
+{"results": {"headqa_en": {"acc": 0.23559445660102116, "acc_norm": 0.2447118891320204, "acc_norm_stderr": 0.008211629406841468, "acc_stderr": 0.008105688874297972}}, "versions": {"headqa_en": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-astronomy-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-astronomy-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..8ecb637cfe4eaf6d3bbca863c7bab6188b85425b
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-astronomy-v0-loglikelihood
@@ -0,0 +1 @@
+bed1e47127cc2893c6aef63b9a0909cca31aa351a703da2a166b01cae03c3311
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-clinical_knowledge-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-clinical_knowledge-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..596bb28a93f52c857b6a39d416114c12c7ea9147
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-clinical_knowledge-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-clinical_knowledge": {"acc": 0.23773584905660378, "acc_norm": 0.27169811320754716, "acc_norm_stderr": 0.027377706624670713, "acc_stderr": 0.02619980880756191}}, "versions": {"hendrycksTest-clinical_knowledge": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_chemistry-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_chemistry-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..d0ca97d6a58d8dae225d36636ef21b0fd1e50fdf
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_chemistry-v0-loglikelihood
@@ -0,0 +1 @@
+f4f338e45415c4b5ee7f1d249155bcd910c8401bd1436760a5ec61cb6bb211b6
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_computer_science-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_computer_science-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..a421564657975a25dedfd1c8cf38ef0e0ea4df9c
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_computer_science-v0-loglikelihood
@@ -0,0 +1 @@
+870d5a6300c527077aaf6baa3e750e75fa840b41657cf82549f39b768b14862d
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_european_history-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_european_history-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..eec5858ef9a22ba66ee0627646b5ce98f2b0326d
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_european_history-v0-loglikelihood
@@ -0,0 +1 @@
+d8070e113be9d420fef5578cb69c70df4ea5118f9b18553023fd9efd5ff0b7f4
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_mathematics-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_mathematics-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..cb3a3ec0688cdc4905ffad6e17c91d59c9330572
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_mathematics-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-high_school_mathematics": {"acc": 0.22592592592592592, "acc_norm": 0.24814814814814815, "acc_norm_stderr": 0.0263357394040558, "acc_stderr": 0.025497532639609553}}, "versions": {"hendrycksTest-high_school_mathematics": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_statistics-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_statistics-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..8a915ef7fc0ab9a7c290867450265a7cadd40494
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_statistics-v0-loglikelihood
@@ -0,0 +1 @@
+33d1d6eaaa2c3a944bf49d3f220a4efc328d7c3b3465b7cec40ae36d8984b75f
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..e05b91503e0a2c2c8bb8ef34af16e87c902c31f9
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-loglikelihood
@@ -0,0 +1 @@
+8c65c1a28330dd001d395ac11f1bb80c3b33f5935f503e74067aef6e9e1d9d9b
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-jurisprudence-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-jurisprudence-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..3d55d21e0294d78ebb728920d0651ccf6f9150b7
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-jurisprudence-v0-loglikelihood
@@ -0,0 +1 @@
+cac440189f1ec778e82f4975d88b74689553ecc5116aaa7f76587a50c1a610e0
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-logical_fallacies-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-logical_fallacies-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..c5cf5cb467d80051cea569ab30ccc20d697e1e57
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-logical_fallacies-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-logical_fallacies": {"acc": 0.20245398773006135, "acc_norm": 0.2147239263803681, "acc_norm_stderr": 0.03226219377286774, "acc_stderr": 0.03157065078911902}}, "versions": {"hendrycksTest-logical_fallacies": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-medical_genetics-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-medical_genetics-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..eac53bcf4a7610934b697d6d19f53ecdf5d4a4ad
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-medical_genetics-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-medical_genetics": {"acc": 0.27, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684, "acc_stderr": 0.0446196043338474}}, "versions": {"hendrycksTest-medical_genetics": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-philosophy-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-philosophy-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..3ea8ef0a0e3ddf5cc42c6305e1885e163399f38c
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-philosophy-v0-loglikelihood
@@ -0,0 +1 @@
+a419204da36c2b7a70fa8909a3a804260cc3283c7e07917534dfb76216c77f46
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-world_religions-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-world_religions-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..118c9b7435cc72387017ba1811d4bb62a16846b5
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-world_religions-v0-loglikelihood
@@ -0,0 +1 @@
+97a0f68ba30ea3a6ef1db1a2925c964b09ecc54455a0a930da083e52677815bd
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-en-ar-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-en-ar-v0-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..fc59546576857b7f52dd4bfbdfc661c8ce871a6a
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-en-ar-v0-greedy_until
@@ -0,0 +1 @@
+b20adbcd2c6d135e28600b427113532c5df624cb3a90e8c5e48715c09a3a38fa
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_fr-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_fr-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..3c444f66611959e4c13451d306fba403261ecfbb
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_fr-v0-loglikelihood
@@ -0,0 +1 @@
+5d16f4a0c51dc6d7b6df2ebeba2bbfa51e700b843779b559b3d90183d7b02a11
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_it-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_it-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..b652210ae3df4694785f6bfe6543909435122dee
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_it-v0-res.json
@@ -0,0 +1 @@
+{"results": {"lambada_mt_it": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_it": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..efd450a8f2a4ca067f7380af809fdda48d1ee465
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai-v0-loglikelihood
@@ -0,0 +1 @@
+6829e6a8aa5922e6c92dd31403cc060f242dc0ede4a775e085a70da095ab2e20
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_cloze-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_cloze-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..a52f2a9f1c83bcc119c95c05394f1bd2a86bf888
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_cloze-v0-res.json
@@ -0,0 +1 @@
+{"results": {"lambada_openai_cloze": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_openai_cloze": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_algebra-v1-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/math_algebra-v1-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..ce881a0232cff3f1025b746184ce8a0170e34303
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_algebra-v1-greedy_until
@@ -0,0 +1 @@
+f19182ce697a2c095d9e5b56ee6659dc38c93994b69ca75d7c3d3f5fd87572b4
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_num_theory-v1-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/math_num_theory-v1-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..82febb9f5dfeefbd6dc5d244574ac5666c6b8bba
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_num_theory-v1-greedy_until
@@ -0,0 +1 @@
+b920ccb507afdcf3ef6f4c04891913731e9f32ec914801791c6d9f8abf6e1897
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_precalc-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/math_precalc-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..a5846590a3b28f2382d00a3400e1c46a9018adea
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_precalc-v1-res.json
@@ -0,0 +1 @@
+{"results": {"math_precalc": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_precalc": 1}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/mnli-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/mnli-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..433b76d01094a18991412513044f0933eb0bf3f5
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/mnli-v0-loglikelihood
@@ -0,0 +1 @@
+4fc7b56b8f1e37e38f4a052b227baec2df914c898c3405d3e994726ba4fba976
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/mrpc-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/mrpc-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..95c849a153762819da1ce59c1b58a2013b97ef6a
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/mrpc-v0-loglikelihood
@@ -0,0 +1 @@
+9f54cbff8d6accba99cfa2c4c4b359563313941018173d7dcf9e32dc28c06583
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/mutual-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/mutual-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..42e97c6f1a1b65ef76cc3941c8b08e8ca836a59c
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/mutual-v1-res.json
@@ -0,0 +1 @@
+{"results": {"mutual": {"mrr": 0.5023513920240772, "mrr_stderr": 0.009501864812936679, "r@1": 0.22460496613995484, "r@1_stderr": 0.014028122493992806, "r@2": 0.4706546275395034, "r@2_stderr": 0.016778343895001414}}, "versions": {"mutual": 1}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/mutual_plus-v1-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/mutual_plus-v1-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..f4ba9d37310a19cc7928fd0d599776d8a9da8dba
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/mutual_plus-v1-loglikelihood
@@ -0,0 +1 @@
+b846bb9db109535f59a93d1ce340cf09f68bdf4fed5b8decd168784220fe07fa
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_enron-v0-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_enron-v0-loglikelihood_rolling
new file mode 100644
index 0000000000000000000000000000000000000000..57dbe764605ef5e1e4578682549a001c851704c0
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_enron-v0-loglikelihood_rolling
@@ -0,0 +1 @@
+4baa6ccdc9e3aa9921675ab4400d5e89d7b546b844a8ea28f6461d649066418a
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_enron-v1-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_enron-v1-loglikelihood_rolling
new file mode 100644
index 0000000000000000000000000000000000000000..57dbe764605ef5e1e4578682549a001c851704c0
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_enron-v1-loglikelihood_rolling
@@ -0,0 +1 @@
+4baa6ccdc9e3aa9921675ab4400d5e89d7b546b844a8ea28f6461d649066418a
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_github-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_github-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..bdabf399695d155026643eacca7954c5f87009d5
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_github-v0-res.json
@@ -0,0 +1 @@
+{"results": {"pile_github": {"bits_per_byte": 9.540627613754646e-05, "byte_perplexity": 1.0000954108274611, "word_perplexity": 1.0009643183931227}}, "versions": {"pile_github": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_hackernews-v0-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_hackernews-v0-loglikelihood_rolling
new file mode 100644
index 0000000000000000000000000000000000000000..48b767bfe706bb035e4553ea9c4119347303bab9
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_hackernews-v0-loglikelihood_rolling
@@ -0,0 +1 @@
+ec1082ee5a5326e0d57aa4e73b634937140c1de9af95f154e8ab57b05d9b422b
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_nih-exporter-v0-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_nih-exporter-v0-loglikelihood_rolling
new file mode 100644
index 0000000000000000000000000000000000000000..5f76588a813eebe7f0958a07253480d30de2ccf3
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_nih-exporter-v0-loglikelihood_rolling
@@ -0,0 +1 @@
+520ea6e04e8a39dc0b5f63a837429a78a40e63d39d109096101feb8c5b2cf8d8
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_opensubtitles-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_opensubtitles-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..1468294732b13576161fc3824a479028d5bdb0ba
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_opensubtitles-v1-res.json
@@ -0,0 +1 @@
+{"results": {"pile_opensubtitles": {"bits_per_byte": 2.1948356082685497e-05, "byte_perplexity": 1.0000152135568616, "word_perplexity": 1.0000856162053249}}, "versions": {"pile_opensubtitles": 1}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_philpapers-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_philpapers-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..5a2f77678abc264edf433a4eb98da08fc20b1dfc
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_philpapers-v1-res.json
@@ -0,0 +1 @@
+{"results": {"pile_philpapers": {"bits_per_byte": 9.004690592465457e-06, "byte_perplexity": 1.0000062415953748, "word_perplexity": 1.0000409888564146}}, "versions": {"pile_philpapers": 1}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_pubmed-abstracts-v0-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_pubmed-abstracts-v0-loglikelihood_rolling
new file mode 100644
index 0000000000000000000000000000000000000000..de5660d60a8d4f0d5e35d47008992befed318d28
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_pubmed-abstracts-v0-loglikelihood_rolling
@@ -0,0 +1 @@
+66436569a43163afb2caf422d32c5f329899e74c49865d4d13881fd465fd9976
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_ubuntu-irc-v1-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_ubuntu-irc-v1-loglikelihood_rolling
new file mode 100644
index 0000000000000000000000000000000000000000..ce041998635643ee17aace3105b227ef0746917e
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_ubuntu-irc-v1-loglikelihood_rolling
@@ -0,0 +1 @@
+4eb69e314f0864ec8890e2323d7e76f8a8309692c4f090e2b41bf4be681a811d
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/piqa-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/piqa-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..b01b1fe5d8c699f855bff57061d6d63715c7f058
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/piqa-v0-loglikelihood
@@ -0,0 +1 @@
+6048a3a2bb3ad1e6a3d98139618e06b4d7de766edd685bd38837596199c3f69f
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pubmedqa-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pubmedqa-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..bb39463a4ab7244109901cbbc06ded3192ee0480
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pubmedqa-v0-res.json
@@ -0,0 +1 @@
+{"results": {"pubmedqa": {"acc": 0.324, "acc_stderr": 0.01480686473373886}}, "versions": {"pubmedqa": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/qa4mre_2013-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/qa4mre_2013-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..c87e487e9ac147c5a9ba8cb3a4b2a39048d1dcaa
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/qa4mre_2013-v0-res.json
@@ -0,0 +1 @@
+{"results": {"qa4mre_2013": {"acc": 0.18309859154929578, "acc_norm": 0.22183098591549297, "acc_norm_stderr": 0.02469760575535269, "acc_stderr": 0.022989742475464973}}, "versions": {"qa4mre_2013": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/record-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/record-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..006c381372178097b36bfac48795e6fbdc242b1a
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/record-v0-res.json
@@ -0,0 +1 @@
+{"results": {"record": {"em": 0.1521, "em_stderr": 0.0035913575128186616, "f1": 0.1581870634920636, "f1_stderr": 0.0036146895141474576}}, "versions": {"record": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_49c47ae40e11f349f2f6b492128188b1b2bc103a421c676ee4b2142a68b43516.pkl b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_49c47ae40e11f349f2f6b492128188b1b2bc103a421c676ee4b2142a68b43516.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..7b02bf1e3c27ff9bb640fe9dc91423dc62db7a90
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_49c47ae40e11f349f2f6b492128188b1b2bc103a421c676ee4b2142a68b43516.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e93fb2906d04f52b903c5e359f942fb6b0d70ab7be0254409f3be70e3cd45dd0
+size 2467
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_4fd8d66a6dad7f602b40e5d7dc298d6fe329299d086a4659743a41f4a4012659.pkl b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_4fd8d66a6dad7f602b40e5d7dc298d6fe329299d086a4659743a41f4a4012659.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..39eba3fbdf7865fba5e076bc6c6b85418ef0c04a
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_4fd8d66a6dad7f602b40e5d7dc298d6fe329299d086a4659743a41f4a4012659.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc36e5e5b1b97b2c588905c100bb7cbb5c262d1ed991ae182fe25cf8a88b61dd
+size 1960
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/truthfulqa_mc-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/truthfulqa_mc-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..c1b1854c2e0abc9c4fe8096b4d45004bcc1a381b
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/truthfulqa_mc-v1-res.json
@@ -0,0 +1 @@
+{"results": {"truthfulqa_mc": {"mc1": 0.23255813953488372, "mc1_stderr": 0.01478915753108052, "mc2": 0.4462325560722362, "mc2_stderr": 0.004986523944692003}}, "versions": {"truthfulqa_mc": 1}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wic-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wic-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..eadc573ed3d6a9b8b9bd924896ef5d719a53d5d1
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wic-v0-res.json
@@ -0,0 +1 @@
+{"results": {"wic": {"acc": 0.49216300940438873, "acc_stderr": 0.01980828765781383}}, "versions": {"wic": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/winogrande-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/winogrande-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..9fa7903a56d2fb48abcd215bb587bc69c00f4aa6
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/winogrande-v0-res.json
@@ -0,0 +1 @@
+{"results": {"winogrande": {"acc": 0.516179952644041, "acc_stderr": 0.014045126130978606}}, "versions": {"winogrande": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-pl-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-pl-v0-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..bd431d61c479beb686d39be21905fdb0beb7781e
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-pl-v0-greedy_until
@@ -0,0 +1 @@
+952f02575d4936d93c4d2808d86c4bf5f1f3a0901212acee6cbc1f9cbd30d39e
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-iu-en-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-iu-en-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..e94cac8876d9d04c883b5ad5810884af7faa436c
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-iu-en-v0-res.json
@@ -0,0 +1 @@
+{"results": {"wmt20-iu-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.012204628007572778, "chrf_stderr": 8.944407532175802e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-iu-en": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-ru-en-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-ru-en-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..b6d0c71ad7c140790040022c51970dea075ab0b1
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-ru-en-v0-res.json
@@ -0,0 +1 @@
+{"results": {"wmt20-ru-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.013344639906399232, "chrf_stderr": 7.583552652374546e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-ru-en": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testyamls/test-01.yaml b/scripts/yans/lm-evaluation-harness/tests/testyamls/test-01.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4e4367eeb3651bd49540009949fc080b9dba0a59
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testyamls/test-01.yaml
@@ -0,0 +1,45 @@
+group: test-1
+group_alias: test 1
+task:
+  - piqa # string task
+  - ai2_arc # string tag
+  - task: super-glue-lm-eval-v1 # Should this be spread out?
+    num_fewshot: 3
+  - task: swag # dict registered task
+    num_fewshot: 2
+  - task: mmlu
+    num_fewshot: 5
+  - group: nli-tasks # dict group
+    task:
+      - anli
+      - boolq
+      - sglue_rte
+    num_fewshot: 4
+    metric_list:
+      - metric: brier_score
+  - task: sciq # dict registered task duplicate
+    task_alias: sciq 2-shot
+    num_fewshot: 2
+  - task: sciq # dict registered task duplicate
+    task_alias: sciq 4-shot
+    num_fewshot: 4
+  - task: sciq # dict registered task duplicate
+    task_alias: sciq 6-shot
+    num_fewshot: 6
+  - task: siqa_custom # dict task
+    dataset_path: social_i_qa
+    dataset_name: null
+    output_type: multiple_choice
+    training_split: train
+    validation_split: validation
+    doc_to_text: "Question: {{context}} {{question}}\nAnswer:"
+    target_delimiter: " "
+    doc_to_choice:
+      - "{{answerA}}"
+      - "{{answerB}}"
+      - "{{answerC}}"
+    doc_to_target: "{{ (label|int) - 1 }}"
+    metric_list:
+      - metric: acc
+        aggregation: mean
+        higher_is_better: true