diff --git a/scripts/yans/lm-evaluation-harness/tests/models/test_api.py b/scripts/yans/lm-evaluation-harness/tests/models/test_api.py new file mode 100644 index 0000000000000000000000000000000000000000..1bca2f7bdbc479d6c8c45171347f11dd8c8892d9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/models/test_api.py @@ -0,0 +1,149 @@ +from unittest.mock import MagicMock, patch + +import pytest + +from lm_eval.models.openai_completions import LocalCompletionsAPI + + +@pytest.fixture +def api(): + return LocalCompletionsAPI( + base_url="http://test-url.com", tokenizer_backend=None, model="gpt-3.5-turbo" + ) + + +@pytest.fixture +def api_tokenized(): + return LocalCompletionsAPI( + base_url="http://test-url.com", + model="EleutherAI/pythia-1b", + tokenizer_backend="huggingface", + ) + + +def test_create_payload_generate(api): + messages = ["Generate a story"] + gen_kwargs = { + "max_tokens": 100, + "temperature": 0.7, + "until": ["The End"], + "do_sample": True, + "seed": 1234, + } + payload = api._create_payload(messages, generate=True, gen_kwargs=gen_kwargs) + + assert payload == { + "prompt": ["Generate a story"], + "model": "gpt-3.5-turbo", + "max_tokens": 100, + "temperature": 0.7, + "stop": ["The End"], + "seed": 1234, + } + + +def test_create_payload_loglikelihood(api): + messages = ["The capital of France is"] + payload = api._create_payload(messages, generate=False, gen_kwargs=None) + + assert payload == { + "model": "gpt-3.5-turbo", + "prompt": ["The capital of France is"], + "max_tokens": 1, + "logprobs": 1, + "echo": True, + "temperature": 0, + "seed": 1234, + } + + +@pytest.mark.parametrize( + "input_messages, generate, gen_kwargs, expected_payload", + [ + ( + ["Hello, how are"], + True, + {"max_gen_toks": 100, "temperature": 0.7}, + { + "prompt": "Hello, how are", + "model": "gpt-3.5-turbo", + "max_tokens": 100, + "temperature": 0.7, + "stop": ["<|endoftext|>"], + "seed": 1234, + }, + ), + ( + ["Hello, how are", "you"], + True, + {}, + { + "prompt": "Hello, how are", + "model": "gpt-3.5-turbo", + "max_tokens": 256, + "temperature": 0, + "stop": ["<|endoftext|>"], + "seed": 1234, + }, + ), + ], +) +def test_model_generate_call_usage( + api, input_messages, generate, gen_kwargs, expected_payload +): + with patch("requests.post") as mock_post: + mock_response = MagicMock() + mock_response.json.return_value = {"result": "success"} + mock_post.return_value = mock_response + + # Act + result = api.model_call( + input_messages, generate=generate, gen_kwargs=gen_kwargs + ) + + # Assert + mock_post.assert_called_once() + _, kwargs = mock_post.call_args + assert "json" in kwargs + assert kwargs["json"] == expected_payload + assert result == {"result": "success"} + + +@pytest.mark.parametrize( + "input_messages, generate, gen_kwargs, expected_payload", + [ + ( + [[1, 2, 3, 4, 5]], + False, + None, + { + "model": "EleutherAI/pythia-1b", + "prompt": [[1, 2, 3, 4, 5]], + "max_tokens": 1, + "logprobs": 1, + "echo": True, + "seed": 1234, + "temperature": 0, + }, + ), + ], +) +def test_model_tokenized_call_usage( + api_tokenized, input_messages, generate, gen_kwargs, expected_payload +): + with patch("requests.post") as mock_post: + mock_response = MagicMock() + mock_response.json.return_value = {"result": "success"} + mock_post.return_value = mock_response + + # Act + result = api_tokenized.model_call( + input_messages, generate=generate, gen_kwargs=gen_kwargs + ) + + # Assert + mock_post.assert_called_once() + _, kwargs = mock_post.call_args + assert "json" in kwargs + assert kwargs["json"] == expected_payload + assert result == {"result": "success"} diff --git a/scripts/yans/lm-evaluation-harness/tests/models/test_gguf.py b/scripts/yans/lm-evaluation-harness/tests/models/test_gguf.py new file mode 100644 index 0000000000000000000000000000000000000000..b5e197e77418fe83e3cc1cf96e23223b80afe633 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/models/test_gguf.py @@ -0,0 +1,152 @@ +import hashlib +import json +import os +import pickle +import unittest +from unittest.mock import patch + +from lm_eval.api.instance import Instance +from lm_eval.models.gguf import GGUFLM + + +base_url = "https://matthoffner-ggml-llm-api.hf.space" + + +def gguf_completion_mock(base_url=None, **kwargs): + # Generate a hash from the parameters + hash_kwargs = {"base_url": base_url, **kwargs} + parameters_hash = hashlib.sha256( + json.dumps(hash_kwargs, sort_keys=True).encode("utf-8") + ).hexdigest() + + fname = f"./tests/testdata/gguf_test_{parameters_hash}.pkl" + + if os.path.exists(fname): + with open(fname, "rb") as fh: + return pickle.load(fh) + else: + print("The file does not exist, attempting to write...") + if "stop" in kwargs: + result = { + "choices": [ + { + "text": f"generated text until {kwargs['stop']}", + "logprobs": {"token_logprobs": [-1.2345], "text_offset": 0}, + "finish_reason": "length", + } + ] + } + else: + # generated with # curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{"prompt": "string", "logprobs": 10, "temperature": 0.0, "max_tokens": 1, "echo": true}' + result = { + "id": "cmpl-4023976b-bc6a-43b0-a5a9-629f4216c7f3", + "object": "text_completion", + "created": 1700511361, + "model": "../llama-2-7b.Q8_0.gguf", + "choices": [ + { + "text": "string(", + "index": 0, + "logprobs": { + "text_offset": [0, 7], + "token_logprobs": [None, -1.033263319857306], + "tokens": [" string", "("], + "top_logprobs": [ + None, + { + "(": -1.033263319857306, + "[]": -2.6530743779017394, + ".": -3.0377145947291324, + "\n": -3.0399156750513976, + "_": -3.510376089937872, + " =": -3.6957918347193663, + ",": -3.9309459866358702, + " of": -4.2834550083949035, + '("': -4.322762841112799, + "()": -4.426229113466925, + }, + ], + }, + "finish_reason": "length", + } + ], + "usage": { + "prompt_tokens": 2, + "completion_tokens": 1, + "total_tokens": 3, + }, + } + + try: + os.makedirs(os.path.dirname(fname), exist_ok=True) + print("Writing file at", fname) + with open(fname, "wb") as fh: + pickle.dump(result, fh) + print("File written successfully") + except Exception as e: + print("File writing failed:", e) + + return result + + +class GGUFLMTest(unittest.TestCase): + @patch( + "lm_eval.models.gguf.GGUFLM.gguf_completion", side_effect=gguf_completion_mock + ) + def test_loglikelihood(self, gguf_completion_mock): + lm = GGUFLM(base_url) + + # Test loglikelihood + requests = [ + Instance( + request_type="loglikelihood", + doc=args, + arguments=args, + idx=i, + ) + for i, args in enumerate([("str", "ing"), ("str", "ing")]) + ] + res = lm.loglikelihood(requests) + + # Assert the loglikelihood response is correct + expected_res = [(logprob, True) for logprob in [0, 0]] + self.assertEqual(res, expected_res) + + @patch( + "lm_eval.models.gguf.GGUFLM.gguf_completion", side_effect=gguf_completion_mock + ) + def test_generate_until(self, gguf_completion_mock): + lm = GGUFLM(base_url) + + # Test generate_until + requests = [ + Instance( + request_type="generate_until", + doc={"input": doc}, + arguments=(doc, {"until": stop}), + idx=i, + ) + for i, (doc, stop) in enumerate([("input1", "stop1"), ("input2", "stop2")]) + ] + + res = lm.generate_until(requests) + + # Assert the generate_until response is correct + expected_res = ["generated text until stop1", "generated text until stop2"] + self.assertEqual(res, expected_res) + + # @patch('lm_eval.models.gguf.GGUFLM.gguf_completion', side_effect=gguf_completion_mock) + # def test_loglikelihood_rolling(self, gguf_completion_mock): + # lm = GGUFLM(base_url) + + # # Test loglikelihood_rolling + # requests = ["input1", "input2"] + # res = lm.loglikelihood_rolling(requests) + + # # Assert the loglikelihood_rolling response is correct + # expected_res = [(-1.2345, True), (-1.2345, True)] + # self.assertEqual(res, expected_res) + + +if __name__ == "__main__": + unittest.main() diff --git a/scripts/yans/lm-evaluation-harness/tests/models/test_huggingface.py b/scripts/yans/lm-evaluation-harness/tests/models/test_huggingface.py new file mode 100644 index 0000000000000000000000000000000000000000..9f495402397a7b336384bb8d2ffe8fd24ddce706 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/models/test_huggingface.py @@ -0,0 +1,148 @@ +from __future__ import annotations + +import os +import sys +from pathlib import Path + +import numpy as np +import torch + +from lm_eval import tasks +from lm_eval.api.instance import Instance +from lm_eval.models.huggingface import HFLM + + +os.environ["TOKENIZERS_PARALLELISM"] = "false" +task_manager = tasks.TaskManager() + +TEST_STRING = "foo bar" + + +class Test_HFLM: + torch.use_deterministic_algorithms(True) + task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"]) + version_minor = sys.version_info.minor + multiple_choice_task = task_list["arc_easy"] # type: ignore + multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1) + MULTIPLE_CH: list[Instance] = multiple_choice_task.instances + generate_until_task = task_list["gsm8k"] # type: ignore + generate_until_task._config.generation_kwargs["max_gen_toks"] = 10 + generate_until_task.set_fewshot_seed(1234) # fewshot random generator seed + generate_until_task.build_all_requests(limit=10, rank=0, world_size=1) + generate_until: list[Instance] = generate_until_task.instances + rolling_task = task_list["wikitext"] # type: ignore + rolling_task.build_all_requests(limit=10, rank=0, world_size=1) + ROLLING: list[Instance] = rolling_task.instances + + MULTIPLE_CH_RES = [ + -41.902435302734375, + -42.939308166503906, + -33.914180755615234, + -37.07139205932617, + -22.95258331298828, + -20.342208862304688, + -14.818366050720215, + -27.942853927612305, + -15.80704116821289, + -15.936427116394043, + -13.052018165588379, + -18.04828453063965, + -13.345029830932617, + -13.366025924682617, + -12.127134323120117, + -11.872495651245117, + -47.10598373413086, + -47.76410675048828, + -36.4406852722168, + -50.0289421081543, + -16.72093963623047, + -18.535587310791016, + -26.46993637084961, + -20.355995178222656, + -17.757919311523438, + -21.80595588684082, + -33.1990852355957, + -39.28636932373047, + -14.759679794311523, + -16.753942489624023, + -11.486852645874023, + -15.42177677154541, + -13.15798282623291, + -15.887393951416016, + -15.28614616394043, + -12.339089393615723, + -44.59441375732422, + -55.40888214111328, + -52.70050811767578, + -56.25089645385742, + ] + generate_until_RES = [ + " The average of $2.50 each is $", + " A robe takes 2 bolts of blue fiber and half", + " $50,000 in repairs.\n\nQuestion", + " He runs 1 sprint 3 times a week.", + " They feed each of her chickens three cups of mixed", + " The price of the glasses is $5, but", + " The total percentage of students who said they like to", + " Carla is downloading a 200 GB file. Normally", + " John drives for 3 hours at a speed of 60", + " Eliza sells 4 tickets to 5 friends so she", + ] + ROLLING_RES = [ + -3603.6328125, + -19779.23974609375, + -8834.16455078125, + -27967.591796875, + -7636.794982910156, + -9491.93505859375, + -41043.4248046875, + -8397.689819335938, + -45969.47155761719, + -7158.90625, + ] + LM = HFLM(pretrained="EleutherAI/pythia-70m", device="cpu", dtype="float32") + + def test_logliklihood(self) -> None: + res = self.LM.loglikelihood(self.MULTIPLE_CH) + _RES, _res = self.MULTIPLE_CH_RES, [r[0] for r in res] + # log samples to CI + dir_path = Path("test_logs") + dir_path.mkdir(parents=True, exist_ok=True) + + file_path = dir_path / f"outputs_log_{self.version_minor}.txt" + file_path = file_path.resolve() + with open(file_path, "w", encoding="utf-8") as f: + f.write("\n".join(str(x) for x in _res)) + assert np.allclose(_res, _RES, atol=1e-2) + # check indices for Multiple Choice + argmax_RES, argmax_res = ( + np.argmax(np.array(_RES).reshape(-1, 4), axis=1), + np.argmax(np.array(_res).reshape(-1, 4), axis=1), + ) + assert (argmax_RES == argmax_res).all() + + def test_generate_until(self) -> None: + res = self.LM.generate_until(self.generate_until) + assert res == self.generate_until_RES + + def test_logliklihood_rolling(self) -> None: + res = self.LM.loglikelihood_rolling(self.ROLLING) + assert np.allclose(res, self.ROLLING_RES, atol=1e-1) + + def test_toc_encode(self) -> None: + res = self.LM.tok_encode(TEST_STRING) + assert res == [12110, 2534] + + def test_toc_decode(self) -> None: + res = self.LM.tok_decode([12110, 2534]) + assert res == TEST_STRING + + def test_batch_encode(self) -> None: + res = self.LM.tok_batch_encode([TEST_STRING, "bar foo"])[0].tolist() + assert res == [[12110, 2534], [2009, 17374]] + + def test_model_generate(self) -> None: + context = self.LM.tok_batch_encode([TEST_STRING])[0] + res = self.LM._model_generate(context, max_length=10, stop=["\n\n"]) + res = self.LM.tok_decode(res[0]) + assert res == "foo bar\n!info bar" diff --git a/scripts/yans/lm-evaluation-harness/tests/models/test_neuron_optimum.py b/scripts/yans/lm-evaluation-harness/tests/models/test_neuron_optimum.py new file mode 100644 index 0000000000000000000000000000000000000000..564d52303968e210439a7931f012487a959a367f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/models/test_neuron_optimum.py @@ -0,0 +1,26 @@ +import pytest +import torch + +from lm_eval.models.neuron_optimum import wrap_constant_batch_size + + +def test_wrap_constant_batch_size(): + class Tester: + def __init__(self, batch_size): + self.batch_size = batch_size + + @wrap_constant_batch_size + def test_constant_batch_size(self, inputs): + assert len(inputs) == self.batch_size + return inputs + + batch_size_test = 8 + for i in range(1, batch_size_test + 1): + tensor = torch.ones([i, 2, 2]) + out = Tester(batch_size=batch_size_test).test_constant_batch_size(tensor) + torch.testing.assert_allclose(out, tensor) + + with pytest.raises(ValueError): + Tester(batch_size=batch_size_test).test_constant_batch_size( + torch.ones([batch_size_test + 1, 2, 2]) + ) diff --git a/scripts/yans/lm-evaluation-harness/tests/models/test_openvino.py b/scripts/yans/lm-evaluation-harness/tests/models/test_openvino.py new file mode 100644 index 0000000000000000000000000000000000000000..b8f13cd9adb3d3850a28055c9a6daf43d40e3874 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/models/test_openvino.py @@ -0,0 +1,92 @@ +import random +import tempfile +from pathlib import Path + +import pytest +from optimum.intel import OVModelForCausalLM +from transformers import AutoTokenizer + +from lm_eval import evaluator +from lm_eval.api.registry import get_model + + +SUPPORTED_ARCHITECTURES_TASKS = { + "facebook/opt-125m": "lambada_openai", + "hf-internal-testing/tiny-random-gpt2": "wikitext", +} + + +@pytest.mark.parametrize("model_id,task", SUPPORTED_ARCHITECTURES_TASKS.items()) +def test_evaluator(model_id, task): + with tempfile.TemporaryDirectory() as tmpdirname: + model = OVModelForCausalLM.from_pretrained( + model_id, export=True, use_cache=True + ) + model.save_pretrained(tmpdirname) + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.save_pretrained(tmpdirname) + + lm = get_model("openvino").create_from_arg_string( + f"pretrained={tmpdirname}", + { + "batch_size": 1, + "device": "cpu", + }, + ) + + def ll_fn(reqs): + for ctx, cont in [req.args for req in reqs]: + if len(ctx) == 0: + continue + # space convention + assert ctx[-1] != " " + assert cont[0] == " " or ctx[-1] == "\n" + + res = [] + + random.seed(42) + for _ in reqs: + res.extend([(-random.random(), False)]) + + return res + + def ll_perp_fn(reqs): + for (string,) in [req.args for req in reqs]: + assert isinstance(string, str) + + res = [] + random.seed(42) + for _ in reqs: + res.extend([-random.random()]) + + return res + + lm.loglikelihood = ll_fn + lm.loglikelihood_rolling = ll_perp_fn + + limit = 10 + evaluator.simple_evaluate( + model=lm, + tasks=[task], + num_fewshot=0, + limit=limit, + bootstrap_iters=10, + ) + + +def test_ov_config(): + """Test that if specified, a custom OpenVINO config is loaded correctly""" + model_id = "hf-internal-testing/tiny-random-gpt2" + with tempfile.TemporaryDirectory() as tmpdirname: + config_file = str(Path(tmpdirname) / "ov_config.json") + with open(Path(config_file), "w", encoding="utf-8") as f: + f.write('{"DYNAMIC_QUANTIZATION_GROUP_SIZE" : "32"}') + lm = get_model("openvino").create_from_arg_string( + f"pretrained={model_id},ov_config={config_file}" + ) + assert ( + lm.model.request.get_compiled_model().get_property( + "DYNAMIC_QUANTIZATION_GROUP_SIZE" + ) + == 32 + ) diff --git a/scripts/yans/lm-evaluation-harness/tests/models/test_vllm.py b/scripts/yans/lm-evaluation-harness/tests/models/test_vllm.py new file mode 100644 index 0000000000000000000000000000000000000000..01363bc8dc31b43549f62120a8ce9fde0788b144 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/models/test_vllm.py @@ -0,0 +1,50 @@ +from typing import List + +import pytest + +from lm_eval import tasks +from lm_eval.api.instance import Instance + + +task_manager = tasks.TaskManager() + + +@pytest.mark.skip(reason="requires CUDA") +class Test_VLLM: + vllm = pytest.importorskip("vllm") + try: + from lm_eval.models.vllm_causallms import VLLM + + LM = VLLM(pretrained="EleutherAI/pythia-70m") + except ModuleNotFoundError: + pass + # torch.use_deterministic_algorithms(True) + task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"]) + multiple_choice_task = task_list["arc_easy"] # type: ignore + multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1) + MULTIPLE_CH: List[Instance] = multiple_choice_task.instances + generate_until_task = task_list["gsm8k"] # type: ignore + generate_until_task._config.generation_kwargs["max_gen_toks"] = 10 + generate_until_task.build_all_requests(limit=10, rank=0, world_size=1) + generate_until: List[Instance] = generate_until_task.instances + rolling_task = task_list["wikitext"] # type: ignore + rolling_task.build_all_requests(limit=10, rank=0, world_size=1) + ROLLING: List[Instance] = rolling_task.instances + + # TODO: make proper tests + def test_logliklihood(self) -> None: + res = self.LM.loglikelihood(self.MULTIPLE_CH) + assert len(res) == len(self.MULTIPLE_CH) + for x in res: + assert isinstance(x[0], float) + + def test_generate_until(self) -> None: + res = self.LM.generate_until(self.generate_until) + assert len(res) == len(self.generate_until) + for x in res: + assert isinstance(x, str) + + def test_logliklihood_rolling(self) -> None: + res = self.LM.loglikelihood_rolling(self.ROLLING) + for x in res: + assert isinstance(x, float) diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v2.0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v2.0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..53b28b5b86050168e13400d47dbf169de133d035 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v2.0-loglikelihood @@ -0,0 +1 @@ +8ebbbc510644ede7bf53496c381e276d5a1eec14828870e8b7e611f231e6d5f6 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_3ds-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_3ds-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..d76cc9bdf55935bc1bc4e71d35267cb58ec618ef --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_3ds-v0-res.json @@ -0,0 +1 @@ +{"results": {"arithmetic_3ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_3ds": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_4ds-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_4ds-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..154cf9c5946ed829ce7e2f173a2b03554fe789a1 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_4ds-v0-loglikelihood @@ -0,0 +1 @@ +d915830b8621e66331383bb2ae4c60acebf008e2f94741092ef4c33ea5441037 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..f808af460570411d4616b6187dd67fa2ddd6ecee --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood @@ -0,0 +1 @@ +7fab9f02e71a224ae7931aa77f8a9a61d887a7480756adc965d4746e97fb04a5 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..276f03f76d1f76f242415e9cdeabf368c9a0f8ce --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_determiner_noun_agreement_with_adj_irregular_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adj_irregular_2": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..66b30be1b864c277e52541b2bd54cda1eb51d4a0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_determiner_noun_agreement_with_adjective_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adjective_1": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_2-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_2-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..1005f68060123bf94b6bf001f9284a7070a64258 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_2-v0-loglikelihood @@ -0,0 +1 @@ +0523771a217759f0b22b89807694ee7f6381ce98a584b1fd070ba96194a3273b \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_inchoative-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_inchoative-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..b494980087dc4ac33621cca2fe716f1fee83fbd1 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_inchoative-v0-loglikelihood @@ -0,0 +1 @@ +3ff73629fb4473986a0e8ae2fcb7c40e88292189ab0d8755d20836c5aa5a2f99 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_past_participle_verbs-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_past_participle_verbs-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f6b991cfefa168f678db82904660157cdc27 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_past_participle_verbs-v0-loglikelihood @@ -0,0 +1 @@ +63ec733873f94ace71cb34112d1c3cd5bb768c26b975fb90acc9b8ba3f4e938e \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..187b79e94c9ec4c378da110948775afc8be14920 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood @@ -0,0 +1 @@ +9534751f83a86b6cbe1fb12fb9feb827b0b7836a663108928b4ecc1d70b08871 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_c_command-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_c_command-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..43fadc2e0b0ea5cd762868a13629b85daec7f499 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_c_command-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_principle_A_c_command": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_c_command": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_reconstruction-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_reconstruction-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..0e7d8db1e2ad279ed4bfcc094253f1fa7723b6ce --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_reconstruction-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_principle_A_reconstruction": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_reconstruction": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..1d50683774ecacf772eaf6287328994d4abc0a98 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_wh_island": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_island": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..14c1bf5f5ee1300b8652f6a73185badea754ec73 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v0-loglikelihood @@ -0,0 +1 @@ +de5aa6f77a2e0fd050b9c272f10c4d5d5581e4f75ffa60926f79e60ae1738960 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/cola-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/cola-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..45737909e7c21c528a647e91cceca3d2534869fc --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/cola-v0-loglikelihood @@ -0,0 +1 @@ +e8635578ed8ee70b707a666d35e468b9321db24470f80c92080651e2bfa01751 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v1-res.json new file mode 100644 index 0000000000000000000000000000000000000000..7941ad62997cb5129be9390a727352b689f807ae --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v1-res.json @@ -0,0 +1 @@ +{"results": {"coqa": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"coqa": 1}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_disability-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_disability-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..cb2b8b79ac3d37913992a56e688ea80d24c0af9e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_disability-v0-res.json @@ -0,0 +1 @@ +{"results": {"crows_pairs_french_disability": {"likelihood_difference": 0.31387939561315326, "likelihood_difference_stderr": 0.027598132299657168, "pct_stereotype": 0.36363636363636365, "pct_stereotype_stderr": 0.05966637484671758}}, "versions": {"crows_pairs_french_disability": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_gender-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_gender-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..c1713a5a881657c9ae4417f6adcf7480491a2915 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_gender-v0-loglikelihood @@ -0,0 +1 @@ +010b8404655911c86555616da23afffce9dc3981e1acbbfdb022d9c474430209 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..e6e282414b02b032bd5b879775686c24e731fd9d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-loglikelihood @@ -0,0 +1 @@ +146eb60c8796fe3f25307a6776337f0b077b58ce02edec64c99df4b906c19b9f \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..f9dd321f7f3c9525491145df99fb4f7658be8065 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-res.json @@ -0,0 +1 @@ +{"results": {"crows_pairs_french_nationality": {"likelihood_difference": 0.33534193269044926, "likelihood_difference_stderr": 0.01429836309463257, "pct_stereotype": 0.4743083003952569, "pct_stereotype_stderr": 0.031455431847992904}}, "versions": {"crows_pairs_french_nationality": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/drop-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/drop-v0-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..6470b349d2e2a54c1ab113346885eb97c045a0ed --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/drop-v0-greedy_until @@ -0,0 +1 @@ +ca566c630d8ac853d5785d4b5c40a5137172c34b48af3350e1f79e6d548b36ba \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_justice-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_justice-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..39efbc506acfd90b86362185d28d43090aeb7d1c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_justice-v0-res.json @@ -0,0 +1 @@ +{"results": {"ethics_justice": {"acc": 0.49556213017751477, "acc_stderr": 0.009616784279885177, "em": 0.057692307692307696}}, "versions": {"ethics_justice": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/gguf_test_52ea409606de8755e03cf7c79f824101a4ce64bb6e6d3df556b8a4e7a5d92418.pkl b/scripts/yans/lm-evaluation-harness/tests/testdata/gguf_test_52ea409606de8755e03cf7c79f824101a4ce64bb6e6d3df556b8a4e7a5d92418.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f468bb46d3da891a285c615b25de9b2d99a7fd8d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/gguf_test_52ea409606de8755e03cf7c79f824101a4ce64bb6e6d3df556b8a4e7a5d92418.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4f122bfaa24901cff1ee686da0cf49ade7b6877c31a3daeb32c8cf2e328a77e +size 153 diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_cfd11f555a5a63b6dfa114a55a932e51b724cdd44d4842586b9ce37260bf7aaa.pkl b/scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_cfd11f555a5a63b6dfa114a55a932e51b724cdd44d4842586b9ce37260bf7aaa.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f27281ef4ea5c0438cbc9bff8ffdbc40a2c847f8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_cfd11f555a5a63b6dfa114a55a932e51b724cdd44d4842586b9ce37260bf7aaa.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d531b0854314516cad7d56c7e28a694bf23072429147b235e9c6534492867bb2 +size 2984 diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..11f07878fb5452ac334eaf0daf276aa8684124f6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-loglikelihood @@ -0,0 +1 @@ +09da45119b12a0144e3081f8fb790c2a22af7b9c3aac42f54423d348a711fbf5 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..6ac5a9c0b8e70a47f2c985713a50336c68b11382 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-res.json @@ -0,0 +1 @@ +{"results": {"headqa_en": {"acc": 0.23559445660102116, "acc_norm": 0.2447118891320204, "acc_norm_stderr": 0.008211629406841468, "acc_stderr": 0.008105688874297972}}, "versions": {"headqa_en": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-astronomy-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-astronomy-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..8ecb637cfe4eaf6d3bbca863c7bab6188b85425b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-astronomy-v0-loglikelihood @@ -0,0 +1 @@ +bed1e47127cc2893c6aef63b9a0909cca31aa351a703da2a166b01cae03c3311 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-clinical_knowledge-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-clinical_knowledge-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..596bb28a93f52c857b6a39d416114c12c7ea9147 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-clinical_knowledge-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-clinical_knowledge": {"acc": 0.23773584905660378, "acc_norm": 0.27169811320754716, "acc_norm_stderr": 0.027377706624670713, "acc_stderr": 0.02619980880756191}}, "versions": {"hendrycksTest-clinical_knowledge": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_chemistry-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_chemistry-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..d0ca97d6a58d8dae225d36636ef21b0fd1e50fdf --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_chemistry-v0-loglikelihood @@ -0,0 +1 @@ +f4f338e45415c4b5ee7f1d249155bcd910c8401bd1436760a5ec61cb6bb211b6 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_computer_science-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_computer_science-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..a421564657975a25dedfd1c8cf38ef0e0ea4df9c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_computer_science-v0-loglikelihood @@ -0,0 +1 @@ +870d5a6300c527077aaf6baa3e750e75fa840b41657cf82549f39b768b14862d \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_european_history-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_european_history-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..eec5858ef9a22ba66ee0627646b5ce98f2b0326d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_european_history-v0-loglikelihood @@ -0,0 +1 @@ +d8070e113be9d420fef5578cb69c70df4ea5118f9b18553023fd9efd5ff0b7f4 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_mathematics-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_mathematics-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..cb3a3ec0688cdc4905ffad6e17c91d59c9330572 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_mathematics-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-high_school_mathematics": {"acc": 0.22592592592592592, "acc_norm": 0.24814814814814815, "acc_norm_stderr": 0.0263357394040558, "acc_stderr": 0.025497532639609553}}, "versions": {"hendrycksTest-high_school_mathematics": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_statistics-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_statistics-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..8a915ef7fc0ab9a7c290867450265a7cadd40494 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_statistics-v0-loglikelihood @@ -0,0 +1 @@ +33d1d6eaaa2c3a944bf49d3f220a4efc328d7c3b3465b7cec40ae36d8984b75f \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..e05b91503e0a2c2c8bb8ef34af16e87c902c31f9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-loglikelihood @@ -0,0 +1 @@ +8c65c1a28330dd001d395ac11f1bb80c3b33f5935f503e74067aef6e9e1d9d9b \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-jurisprudence-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-jurisprudence-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..3d55d21e0294d78ebb728920d0651ccf6f9150b7 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-jurisprudence-v0-loglikelihood @@ -0,0 +1 @@ +cac440189f1ec778e82f4975d88b74689553ecc5116aaa7f76587a50c1a610e0 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-logical_fallacies-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-logical_fallacies-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..c5cf5cb467d80051cea569ab30ccc20d697e1e57 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-logical_fallacies-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-logical_fallacies": {"acc": 0.20245398773006135, "acc_norm": 0.2147239263803681, "acc_norm_stderr": 0.03226219377286774, "acc_stderr": 0.03157065078911902}}, "versions": {"hendrycksTest-logical_fallacies": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-medical_genetics-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-medical_genetics-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..eac53bcf4a7610934b697d6d19f53ecdf5d4a4ad --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-medical_genetics-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-medical_genetics": {"acc": 0.27, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684, "acc_stderr": 0.0446196043338474}}, "versions": {"hendrycksTest-medical_genetics": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-philosophy-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-philosophy-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..3ea8ef0a0e3ddf5cc42c6305e1885e163399f38c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-philosophy-v0-loglikelihood @@ -0,0 +1 @@ +a419204da36c2b7a70fa8909a3a804260cc3283c7e07917534dfb76216c77f46 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-world_religions-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-world_religions-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..118c9b7435cc72387017ba1811d4bb62a16846b5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-world_religions-v0-loglikelihood @@ -0,0 +1 @@ +97a0f68ba30ea3a6ef1db1a2925c964b09ecc54455a0a930da083e52677815bd \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-en-ar-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-en-ar-v0-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..fc59546576857b7f52dd4bfbdfc661c8ce871a6a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-en-ar-v0-greedy_until @@ -0,0 +1 @@ +b20adbcd2c6d135e28600b427113532c5df624cb3a90e8c5e48715c09a3a38fa \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_fr-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_fr-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..3c444f66611959e4c13451d306fba403261ecfbb --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_fr-v0-loglikelihood @@ -0,0 +1 @@ +5d16f4a0c51dc6d7b6df2ebeba2bbfa51e700b843779b559b3d90183d7b02a11 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_it-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_it-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..b652210ae3df4694785f6bfe6543909435122dee --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_it-v0-res.json @@ -0,0 +1 @@ +{"results": {"lambada_mt_it": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_it": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..efd450a8f2a4ca067f7380af809fdda48d1ee465 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai-v0-loglikelihood @@ -0,0 +1 @@ +6829e6a8aa5922e6c92dd31403cc060f242dc0ede4a775e085a70da095ab2e20 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_cloze-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_cloze-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..a52f2a9f1c83bcc119c95c05394f1bd2a86bf888 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_cloze-v0-res.json @@ -0,0 +1 @@ +{"results": {"lambada_openai_cloze": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_openai_cloze": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_algebra-v1-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/math_algebra-v1-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..ce881a0232cff3f1025b746184ce8a0170e34303 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_algebra-v1-greedy_until @@ -0,0 +1 @@ +f19182ce697a2c095d9e5b56ee6659dc38c93994b69ca75d7c3d3f5fd87572b4 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_num_theory-v1-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/math_num_theory-v1-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..82febb9f5dfeefbd6dc5d244574ac5666c6b8bba --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_num_theory-v1-greedy_until @@ -0,0 +1 @@ +b920ccb507afdcf3ef6f4c04891913731e9f32ec914801791c6d9f8abf6e1897 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_precalc-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/math_precalc-v1-res.json new file mode 100644 index 0000000000000000000000000000000000000000..a5846590a3b28f2382d00a3400e1c46a9018adea --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_precalc-v1-res.json @@ -0,0 +1 @@ +{"results": {"math_precalc": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_precalc": 1}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/mnli-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/mnli-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..433b76d01094a18991412513044f0933eb0bf3f5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/mnli-v0-loglikelihood @@ -0,0 +1 @@ +4fc7b56b8f1e37e38f4a052b227baec2df914c898c3405d3e994726ba4fba976 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/mrpc-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/mrpc-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..95c849a153762819da1ce59c1b58a2013b97ef6a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/mrpc-v0-loglikelihood @@ -0,0 +1 @@ +9f54cbff8d6accba99cfa2c4c4b359563313941018173d7dcf9e32dc28c06583 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/mutual-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/mutual-v1-res.json new file mode 100644 index 0000000000000000000000000000000000000000..42e97c6f1a1b65ef76cc3941c8b08e8ca836a59c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/mutual-v1-res.json @@ -0,0 +1 @@ +{"results": {"mutual": {"mrr": 0.5023513920240772, "mrr_stderr": 0.009501864812936679, "r@1": 0.22460496613995484, "r@1_stderr": 0.014028122493992806, "r@2": 0.4706546275395034, "r@2_stderr": 0.016778343895001414}}, "versions": {"mutual": 1}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/mutual_plus-v1-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/mutual_plus-v1-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..f4ba9d37310a19cc7928fd0d599776d8a9da8dba --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/mutual_plus-v1-loglikelihood @@ -0,0 +1 @@ +b846bb9db109535f59a93d1ce340cf09f68bdf4fed5b8decd168784220fe07fa \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_enron-v0-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_enron-v0-loglikelihood_rolling new file mode 100644 index 0000000000000000000000000000000000000000..57dbe764605ef5e1e4578682549a001c851704c0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_enron-v0-loglikelihood_rolling @@ -0,0 +1 @@ +4baa6ccdc9e3aa9921675ab4400d5e89d7b546b844a8ea28f6461d649066418a \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_enron-v1-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_enron-v1-loglikelihood_rolling new file mode 100644 index 0000000000000000000000000000000000000000..57dbe764605ef5e1e4578682549a001c851704c0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_enron-v1-loglikelihood_rolling @@ -0,0 +1 @@ +4baa6ccdc9e3aa9921675ab4400d5e89d7b546b844a8ea28f6461d649066418a \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_github-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_github-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..bdabf399695d155026643eacca7954c5f87009d5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_github-v0-res.json @@ -0,0 +1 @@ +{"results": {"pile_github": {"bits_per_byte": 9.540627613754646e-05, "byte_perplexity": 1.0000954108274611, "word_perplexity": 1.0009643183931227}}, "versions": {"pile_github": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_hackernews-v0-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_hackernews-v0-loglikelihood_rolling new file mode 100644 index 0000000000000000000000000000000000000000..48b767bfe706bb035e4553ea9c4119347303bab9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_hackernews-v0-loglikelihood_rolling @@ -0,0 +1 @@ +ec1082ee5a5326e0d57aa4e73b634937140c1de9af95f154e8ab57b05d9b422b \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_nih-exporter-v0-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_nih-exporter-v0-loglikelihood_rolling new file mode 100644 index 0000000000000000000000000000000000000000..5f76588a813eebe7f0958a07253480d30de2ccf3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_nih-exporter-v0-loglikelihood_rolling @@ -0,0 +1 @@ +520ea6e04e8a39dc0b5f63a837429a78a40e63d39d109096101feb8c5b2cf8d8 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_opensubtitles-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_opensubtitles-v1-res.json new file mode 100644 index 0000000000000000000000000000000000000000..1468294732b13576161fc3824a479028d5bdb0ba --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_opensubtitles-v1-res.json @@ -0,0 +1 @@ +{"results": {"pile_opensubtitles": {"bits_per_byte": 2.1948356082685497e-05, "byte_perplexity": 1.0000152135568616, "word_perplexity": 1.0000856162053249}}, "versions": {"pile_opensubtitles": 1}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_philpapers-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_philpapers-v1-res.json new file mode 100644 index 0000000000000000000000000000000000000000..5a2f77678abc264edf433a4eb98da08fc20b1dfc --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_philpapers-v1-res.json @@ -0,0 +1 @@ +{"results": {"pile_philpapers": {"bits_per_byte": 9.004690592465457e-06, "byte_perplexity": 1.0000062415953748, "word_perplexity": 1.0000409888564146}}, "versions": {"pile_philpapers": 1}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_pubmed-abstracts-v0-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_pubmed-abstracts-v0-loglikelihood_rolling new file mode 100644 index 0000000000000000000000000000000000000000..de5660d60a8d4f0d5e35d47008992befed318d28 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_pubmed-abstracts-v0-loglikelihood_rolling @@ -0,0 +1 @@ +66436569a43163afb2caf422d32c5f329899e74c49865d4d13881fd465fd9976 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_ubuntu-irc-v1-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_ubuntu-irc-v1-loglikelihood_rolling new file mode 100644 index 0000000000000000000000000000000000000000..ce041998635643ee17aace3105b227ef0746917e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_ubuntu-irc-v1-loglikelihood_rolling @@ -0,0 +1 @@ +4eb69e314f0864ec8890e2323d7e76f8a8309692c4f090e2b41bf4be681a811d \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/piqa-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/piqa-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..b01b1fe5d8c699f855bff57061d6d63715c7f058 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/piqa-v0-loglikelihood @@ -0,0 +1 @@ +6048a3a2bb3ad1e6a3d98139618e06b4d7de766edd685bd38837596199c3f69f \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pubmedqa-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pubmedqa-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..bb39463a4ab7244109901cbbc06ded3192ee0480 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pubmedqa-v0-res.json @@ -0,0 +1 @@ +{"results": {"pubmedqa": {"acc": 0.324, "acc_stderr": 0.01480686473373886}}, "versions": {"pubmedqa": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/qa4mre_2013-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/qa4mre_2013-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..c87e487e9ac147c5a9ba8cb3a4b2a39048d1dcaa --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/qa4mre_2013-v0-res.json @@ -0,0 +1 @@ +{"results": {"qa4mre_2013": {"acc": 0.18309859154929578, "acc_norm": 0.22183098591549297, "acc_norm_stderr": 0.02469760575535269, "acc_stderr": 0.022989742475464973}}, "versions": {"qa4mre_2013": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/record-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/record-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..006c381372178097b36bfac48795e6fbdc242b1a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/record-v0-res.json @@ -0,0 +1 @@ +{"results": {"record": {"em": 0.1521, "em_stderr": 0.0035913575128186616, "f1": 0.1581870634920636, "f1_stderr": 0.0036146895141474576}}, "versions": {"record": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_49c47ae40e11f349f2f6b492128188b1b2bc103a421c676ee4b2142a68b43516.pkl b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_49c47ae40e11f349f2f6b492128188b1b2bc103a421c676ee4b2142a68b43516.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7b02bf1e3c27ff9bb640fe9dc91423dc62db7a90 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_49c47ae40e11f349f2f6b492128188b1b2bc103a421c676ee4b2142a68b43516.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e93fb2906d04f52b903c5e359f942fb6b0d70ab7be0254409f3be70e3cd45dd0 +size 2467 diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_4fd8d66a6dad7f602b40e5d7dc298d6fe329299d086a4659743a41f4a4012659.pkl b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_4fd8d66a6dad7f602b40e5d7dc298d6fe329299d086a4659743a41f4a4012659.pkl new file mode 100644 index 0000000000000000000000000000000000000000..39eba3fbdf7865fba5e076bc6c6b85418ef0c04a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_4fd8d66a6dad7f602b40e5d7dc298d6fe329299d086a4659743a41f4a4012659.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc36e5e5b1b97b2c588905c100bb7cbb5c262d1ed991ae182fe25cf8a88b61dd +size 1960 diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/truthfulqa_mc-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/truthfulqa_mc-v1-res.json new file mode 100644 index 0000000000000000000000000000000000000000..c1b1854c2e0abc9c4fe8096b4d45004bcc1a381b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/truthfulqa_mc-v1-res.json @@ -0,0 +1 @@ +{"results": {"truthfulqa_mc": {"mc1": 0.23255813953488372, "mc1_stderr": 0.01478915753108052, "mc2": 0.4462325560722362, "mc2_stderr": 0.004986523944692003}}, "versions": {"truthfulqa_mc": 1}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wic-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wic-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..eadc573ed3d6a9b8b9bd924896ef5d719a53d5d1 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wic-v0-res.json @@ -0,0 +1 @@ +{"results": {"wic": {"acc": 0.49216300940438873, "acc_stderr": 0.01980828765781383}}, "versions": {"wic": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/winogrande-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/winogrande-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..9fa7903a56d2fb48abcd215bb587bc69c00f4aa6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/winogrande-v0-res.json @@ -0,0 +1 @@ +{"results": {"winogrande": {"acc": 0.516179952644041, "acc_stderr": 0.014045126130978606}}, "versions": {"winogrande": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-pl-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-pl-v0-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..bd431d61c479beb686d39be21905fdb0beb7781e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-pl-v0-greedy_until @@ -0,0 +1 @@ +952f02575d4936d93c4d2808d86c4bf5f1f3a0901212acee6cbc1f9cbd30d39e \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-iu-en-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-iu-en-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..e94cac8876d9d04c883b5ad5810884af7faa436c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-iu-en-v0-res.json @@ -0,0 +1 @@ +{"results": {"wmt20-iu-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.012204628007572778, "chrf_stderr": 8.944407532175802e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-iu-en": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-ru-en-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-ru-en-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..b6d0c71ad7c140790040022c51970dea075ab0b1 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-ru-en-v0-res.json @@ -0,0 +1 @@ +{"results": {"wmt20-ru-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.013344639906399232, "chrf_stderr": 7.583552652374546e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-ru-en": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testyamls/test-01.yaml b/scripts/yans/lm-evaluation-harness/tests/testyamls/test-01.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4e4367eeb3651bd49540009949fc080b9dba0a59 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testyamls/test-01.yaml @@ -0,0 +1,45 @@ +group: test-1 +group_alias: test 1 +task: + - piqa # string task + - ai2_arc # string tag + - task: super-glue-lm-eval-v1 # Should this be spread out? + num_fewshot: 3 + - task: swag # dict registered task + num_fewshot: 2 + - task: mmlu + num_fewshot: 5 + - group: nli-tasks # dict group + task: + - anli + - boolq + - sglue_rte + num_fewshot: 4 + metric_list: + - metric: brier_score + - task: sciq # dict registered task duplicate + task_alias: sciq 2-shot + num_fewshot: 2 + - task: sciq # dict registered task duplicate + task_alias: sciq 4-shot + num_fewshot: 4 + - task: sciq # dict registered task duplicate + task_alias: sciq 6-shot + num_fewshot: 6 + - task: siqa_custom # dict task + dataset_path: social_i_qa + dataset_name: null + output_type: multiple_choice + training_split: train + validation_split: validation + doc_to_text: "Question: {{context}} {{question}}\nAnswer:" + target_delimiter: " " + doc_to_choice: + - "{{answerA}}" + - "{{answerB}}" + - "{{answerC}}" + doc_to_target: "{{ (label|int) - 1 }}" + metric_list: + - metric: acc + aggregation: mean + higher_is_better: true