koichi12 committed on Nov 28, 2024

Commit

0ba7ae8

verified ·

1 Parent(s): 277ed5d

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

scripts/yans/lm-evaluation-harness/tests/models/test_api.py +149 -0
scripts/yans/lm-evaluation-harness/tests/models/test_gguf.py +152 -0
scripts/yans/lm-evaluation-harness/tests/models/test_huggingface.py +148 -0
scripts/yans/lm-evaluation-harness/tests/models/test_neuron_optimum.py +26 -0
scripts/yans/lm-evaluation-harness/tests/models/test_openvino.py +92 -0
scripts/yans/lm-evaluation-harness/tests/models/test_vllm.py +50 -0
scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v2.0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_3ds-v0-res.json +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_4ds-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-res.json +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_2-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_inchoative-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_past_participle_verbs-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_c_command-v0-res.json +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_reconstruction-v0-res.json +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-res.json +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/cola-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v1-res.json +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_disability-v0-res.json +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_gender-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-res.json +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/drop-v0-greedy_until +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/ethics_justice-v0-res.json +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/gguf_test_52ea409606de8755e03cf7c79f824101a4ce64bb6e6d3df556b8a4e7a5d92418.pkl +3 -0
scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_cfd11f555a5a63b6dfa114a55a932e51b724cdd44d4842586b9ce37260bf7aaa.pkl +3 -0
scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-res.json +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-astronomy-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-clinical_knowledge-v0-res.json +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_chemistry-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_computer_science-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_european_history-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_mathematics-v0-res.json +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_statistics-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-jurisprudence-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-logical_fallacies-v0-res.json +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-medical_genetics-v0-res.json +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-philosophy-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-world_religions-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-en-ar-v0-greedy_until +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_fr-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_it-v0-res.json +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai-v0-loglikelihood +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_cloze-v0-res.json +1 -0

scripts/yans/lm-evaluation-harness/tests/models/test_api.py ADDED Viewed

	@@ -0,0 +1,149 @@

+from unittest.mock import MagicMock, patch
+import pytest
+from lm_eval.models.openai_completions import LocalCompletionsAPI
+@pytest.fixture
+def api():
+    return LocalCompletionsAPI(
+        base_url="http://test-url.com", tokenizer_backend=None, model="gpt-3.5-turbo"
+    )
+@pytest.fixture
+def api_tokenized():
+    return LocalCompletionsAPI(
+        base_url="http://test-url.com",
+        model="EleutherAI/pythia-1b",
+        tokenizer_backend="huggingface",
+    )
+def test_create_payload_generate(api):
+    messages = ["Generate a story"]
+    gen_kwargs = {
+        "max_tokens": 100,
+        "temperature": 0.7,
+        "until": ["The End"],
+        "do_sample": True,
+        "seed": 1234,
+    }
+    payload = api._create_payload(messages, generate=True, gen_kwargs=gen_kwargs)
+    assert payload == {
+        "prompt": ["Generate a story"],
+        "model": "gpt-3.5-turbo",
+        "max_tokens": 100,
+        "temperature": 0.7,
+        "stop": ["The End"],
+        "seed": 1234,
+    }
+def test_create_payload_loglikelihood(api):
+    messages = ["The capital of France is"]
+    payload = api._create_payload(messages, generate=False, gen_kwargs=None)
+    assert payload == {
+        "model": "gpt-3.5-turbo",
+        "prompt": ["The capital of France is"],
+        "max_tokens": 1,
+        "logprobs": 1,
+        "echo": True,
+        "temperature": 0,
+        "seed": 1234,
+    }
+@pytest.mark.parametrize(
+    "input_messages, generate, gen_kwargs, expected_payload",
+    [
+        (
+            ["Hello, how are"],
+            True,
+            {"max_gen_toks": 100, "temperature": 0.7},
+            {
+                "prompt": "Hello, how are",
+                "model": "gpt-3.5-turbo",
+                "max_tokens": 100,
+                "temperature": 0.7,
+                "stop": ["<|endoftext|>"],
+                "seed": 1234,
+            },
+        ),
+        (
+            ["Hello, how are", "you"],
+            True,
+            {},
+            {
+                "prompt": "Hello, how are",
+                "model": "gpt-3.5-turbo",
+                "max_tokens": 256,
+                "temperature": 0,
+                "stop": ["<|endoftext|>"],
+                "seed": 1234,
+            },
+        ),
+    ],
+)
+def test_model_generate_call_usage(
+    api, input_messages, generate, gen_kwargs, expected_payload
+):
+    with patch("requests.post") as mock_post:
+        mock_response = MagicMock()
+        mock_response.json.return_value = {"result": "success"}
+        mock_post.return_value = mock_response
+        # Act
+        result = api.model_call(
+            input_messages, generate=generate, gen_kwargs=gen_kwargs
+        )
+        # Assert
+        mock_post.assert_called_once()
+        _, kwargs = mock_post.call_args
+        assert "json" in kwargs
+        assert kwargs["json"] == expected_payload
+        assert result == {"result": "success"}
+@pytest.mark.parametrize(
+    "input_messages, generate, gen_kwargs, expected_payload",
+    [
+        (
+            [[1, 2, 3, 4, 5]],
+            False,
+            None,
+            {
+                "model": "EleutherAI/pythia-1b",
+                "prompt": [[1, 2, 3, 4, 5]],
+                "max_tokens": 1,
+                "logprobs": 1,
+                "echo": True,
+                "seed": 1234,
+                "temperature": 0,
+            },
+        ),
+    ],
+)
+def test_model_tokenized_call_usage(
+    api_tokenized, input_messages, generate, gen_kwargs, expected_payload
+):
+    with patch("requests.post") as mock_post:
+        mock_response = MagicMock()
+        mock_response.json.return_value = {"result": "success"}
+        mock_post.return_value = mock_response
+        # Act
+        result = api_tokenized.model_call(
+            input_messages, generate=generate, gen_kwargs=gen_kwargs
+        )
+        # Assert
+        mock_post.assert_called_once()
+        _, kwargs = mock_post.call_args
+        assert "json" in kwargs
+        assert kwargs["json"] == expected_payload
+        assert result == {"result": "success"}

scripts/yans/lm-evaluation-harness/tests/models/test_gguf.py ADDED Viewed

	@@ -0,0 +1,152 @@

+import hashlib
+import json
+import os
+import pickle
+import unittest
+from unittest.mock import patch
+from lm_eval.api.instance import Instance
+from lm_eval.models.gguf import GGUFLM
+# URL handed to GGUFLM(...) in the tests below; those tests patch
+# GGUFLM.gguf_completion with gguf_completion_mock.
+base_url = "https://matthoffner-ggml-llm-api.hf.space"
+def gguf_completion_mock(base_url=None, **kwargs):
+    # Generate a hash from the parameters
+    hash_kwargs = {"base_url": base_url, **kwargs}
+    parameters_hash = hashlib.sha256(
+        json.dumps(hash_kwargs, sort_keys=True).encode("utf-8")
+    ).hexdigest()
+    fname = f"./tests/testdata/gguf_test_{parameters_hash}.pkl"
+    if os.path.exists(fname):
+        with open(fname, "rb") as fh:
+            return pickle.load(fh)
+    else:
+        print("The file does not exist, attempting to write...")
+        if "stop" in kwargs:
+            result = {
+                "choices": [
+                    {
+                        "text": f"generated text until {kwargs['stop']}",
+                        "logprobs": {"token_logprobs": [-1.2345], "text_offset": 0},
+                        "finish_reason": "length",
+                    }
+                ]
+            }
+        else:
+            # generated with # curl -X 'POST'   'http://localhost:8000/v1/completions'   -H 'accept: application/json'   -H 'Content-Type: application/json'   -d '{"prompt": "string", "logprobs": 10, "temperature": 0.0, "max_tokens": 1, "echo": true}'
+            result = {
+                "id": "cmpl-4023976b-bc6a-43b0-a5a9-629f4216c7f3",
+                "object": "text_completion",
+                "created": 1700511361,
+                "model": "../llama-2-7b.Q8_0.gguf",
+                "choices": [
+                    {
+                        "text": "string(",
+                        "index": 0,
+                        "logprobs": {
+                            "text_offset": [0, 7],
+                            "token_logprobs": [None, -1.033263319857306],
+                            "tokens": [" string", "("],
+                            "top_logprobs": [
+                                None,
+                                {
+                                    "(": -1.033263319857306,
+                                    "[]": -2.6530743779017394,
+                                    ".": -3.0377145947291324,
+                                    "\n": -3.0399156750513976,
+                                    "_": -3.510376089937872,
+                                    " =": -3.6957918347193663,
+                                    ",": -3.9309459866358702,
+                                    " of": -4.2834550083949035,
+                                    '("': -4.322762841112799,
+                                    "()": -4.426229113466925,
+                                },
+                            ],
+                        },
+                        "finish_reason": "length",
+                    }
+                ],
+                "usage": {
+                    "prompt_tokens": 2,
+                    "completion_tokens": 1,
+                    "total_tokens": 3,
+                },
+            }
+        try:
+            os.makedirs(os.path.dirname(fname), exist_ok=True)
+            print("Writing file at", fname)
+            with open(fname, "wb") as fh:
+                pickle.dump(result, fh)
+            print("File written successfully")
+        except Exception as e:
+            print("File writing failed:", e)
+        return result
+class GGUFLMTest(unittest.TestCase):
+    @patch(
+        "lm_eval.models.gguf.GGUFLM.gguf_completion", side_effect=gguf_completion_mock
+    )
+    def test_loglikelihood(self, gguf_completion_mock):
+        lm = GGUFLM(base_url)
+        # Test loglikelihood
+        requests = [
+            Instance(
+                request_type="loglikelihood",
+                doc=args,
+                arguments=args,
+                idx=i,
+            )
+            for i, args in enumerate([("str", "ing"), ("str", "ing")])
+        ]
+        res = lm.loglikelihood(requests)
+        # Assert the loglikelihood response is correct
+        expected_res = [(logprob, True) for logprob in [0, 0]]
+        self.assertEqual(res, expected_res)
+    @patch(
+        "lm_eval.models.gguf.GGUFLM.gguf_completion", side_effect=gguf_completion_mock
+    )
+    def test_generate_until(self, gguf_completion_mock):
+        lm = GGUFLM(base_url)
+        # Test generate_until
+        requests = [
+            Instance(
+                request_type="generate_until",
+                doc={"input": doc},
+                arguments=(doc, {"until": stop}),
+                idx=i,
+            )
+            for i, (doc, stop) in enumerate([("input1", "stop1"), ("input2", "stop2")])
+        ]
+        res = lm.generate_until(requests)
+        # Assert the generate_until response is correct
+        expected_res = ["generated text until stop1", "generated text until stop2"]
+        self.assertEqual(res, expected_res)
+    # @patch('lm_eval.models.gguf.GGUFLM.gguf_completion', side_effect=gguf_completion_mock)
+    # def test_loglikelihood_rolling(self, gguf_completion_mock):
+    #     lm = GGUFLM(base_url)
+    #     # Test loglikelihood_rolling
+    #     requests = ["input1", "input2"]
+    #     res = lm.loglikelihood_rolling(requests)
+    #     # Assert the loglikelihood_rolling response is correct
+    #     expected_res = [(-1.2345, True), (-1.2345, True)]
+    #     self.assertEqual(res, expected_res)
+# Allow running this test module directly through unittest's runner.
+if __name__ == "__main__":
+    unittest.main()

scripts/yans/lm-evaluation-harness/tests/models/test_huggingface.py ADDED Viewed

	@@ -0,0 +1,148 @@

+from __future__ import annotations
+import os
+import sys
+from pathlib import Path
+import numpy as np
+import torch
+from lm_eval import tasks
+from lm_eval.api.instance import Instance
+from lm_eval.models.huggingface import HFLM
+# Set TOKENIZERS_PARALLELISM to "false" for this process
+# (presumably to silence tokenizers parallelism warnings; confirm).
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+# Task manager used by Test_HFLM to load the tasks it builds requests from.
+task_manager = tasks.TaskManager()
+# Input shared by the tokenizer encode/decode/generate tests.
+TEST_STRING = "foo bar"
+class Test_HFLM:
+    """Regression tests comparing HFLM outputs with values recorded in this class.
+    The class body runs when the class is defined: it enables deterministic
+    torch algorithms, loads three tasks, builds their requests with limit=10,
+    and instantiates HFLM for EleutherAI/pythia-70m on CPU in float32.
+    """
+    torch.use_deterministic_algorithms(True)
+    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
+    # Python minor version; used to name the log file written by test_logliklihood.
+    version_minor = sys.version_info.minor
+    # Requests for the loglikelihood test (arc_easy).
+    multiple_choice_task = task_list["arc_easy"]  # type: ignore
+    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
+    MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
+    # Requests for the generate_until test (gsm8k), capped at 10 generated tokens.
+    generate_until_task = task_list["gsm8k"]  # type: ignore
+    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
+    generate_until_task.set_fewshot_seed(1234)  # fewshot random generator seed
+    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
+    generate_until: list[Instance] = generate_until_task.instances
+    # Requests for the rolling loglikelihood test (wikitext).
+    rolling_task = task_list["wikitext"]  # type: ignore
+    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
+    ROLLING: list[Instance] = rolling_task.instances
+    # Expected loglikelihoods: 40 values, compared in rows of 4 by test_logliklihood.
+    MULTIPLE_CH_RES = [
+        -41.902435302734375,
+        -42.939308166503906,
+        -33.914180755615234,
+        -37.07139205932617,
+        -22.95258331298828,
+        -20.342208862304688,
+        -14.818366050720215,
+        -27.942853927612305,
+        -15.80704116821289,
+        -15.936427116394043,
+        -13.052018165588379,
+        -18.04828453063965,
+        -13.345029830932617,
+        -13.366025924682617,
+        -12.127134323120117,
+        -11.872495651245117,
+        -47.10598373413086,
+        -47.76410675048828,
+        -36.4406852722168,
+        -50.0289421081543,
+        -16.72093963623047,
+        -18.535587310791016,
+        -26.46993637084961,
+        -20.355995178222656,
+        -17.757919311523438,
+        -21.80595588684082,
+        -33.1990852355957,
+        -39.28636932373047,
+        -14.759679794311523,
+        -16.753942489624023,
+        -11.486852645874023,
+        -15.42177677154541,
+        -13.15798282623291,
+        -15.887393951416016,
+        -15.28614616394043,
+        -12.339089393615723,
+        -44.59441375732422,
+        -55.40888214111328,
+        -52.70050811767578,
+        -56.25089645385742,
+    ]
+    # Expected generations, compared for exact equality by test_generate_until.
+    generate_until_RES = [
+        " The average of $2.50 each is $",
+        " A robe takes 2 bolts of blue fiber and half",
+        " $50,000 in repairs.\n\nQuestion",
+        " He runs 1 sprint 3 times a week.",
+        " They feed each of her chickens three cups of mixed",
+        " The price of the glasses is $5, but",
+        " The total percentage of students who said they like to",
+        " Carla is downloading a 200 GB file. Normally",
+        " John drives for 3 hours at a speed of 60",
+        " Eliza sells 4 tickets to 5 friends so she",
+    ]
+    # Expected rolling loglikelihoods, compared with atol=1e-1.
+    ROLLING_RES = [
+        -3603.6328125,
+        -19779.23974609375,
+        -8834.16455078125,
+        -27967.591796875,
+        -7636.794982910156,
+        -9491.93505859375,
+        -41043.4248046875,
+        -8397.689819335938,
+        -45969.47155761719,
+        -7158.90625,
+    ]
+    # Model under test, shared by every test method.
+    LM = HFLM(pretrained="EleutherAI/pythia-70m", device="cpu", dtype="float32")
+    def test_logliklihood(self) -> None:
+        """Compare loglikelihoods with MULTIPLE_CH_RES (atol=1e-2) and check per-row argmax."""
+        res = self.LM.loglikelihood(self.MULTIPLE_CH)
+        # Keep only the first element of each result tuple for the comparison.
+        _RES, _res = self.MULTIPLE_CH_RES, [r[0] for r in res]
+        # log samples to CI
+        dir_path = Path("test_logs")
+        dir_path.mkdir(parents=True, exist_ok=True)
+        file_path = dir_path / f"outputs_log_{self.version_minor}.txt"
+        file_path = file_path.resolve()
+        with open(file_path, "w", encoding="utf-8") as f:
+            f.write("\n".join(str(x) for x in _res))
+        assert np.allclose(_res, _RES, atol=1e-2)
+        # check indices for Multiple Choice
+        argmax_RES, argmax_res = (
+            np.argmax(np.array(_RES).reshape(-1, 4), axis=1),
+            np.argmax(np.array(_res).reshape(-1, 4), axis=1),
+        )
+        assert (argmax_RES == argmax_res).all()
+    def test_generate_until(self) -> None:
+        """Generated strings must equal generate_until_RES exactly."""
+        res = self.LM.generate_until(self.generate_until)
+        assert res == self.generate_until_RES
+    def test_logliklihood_rolling(self) -> None:
+        """Rolling loglikelihoods must match ROLLING_RES within atol=1e-1."""
+        res = self.LM.loglikelihood_rolling(self.ROLLING)
+        assert np.allclose(res, self.ROLLING_RES, atol=1e-1)
+    def test_toc_encode(self) -> None:
+        """tok_encode(TEST_STRING) must return the recorded token ids."""
+        res = self.LM.tok_encode(TEST_STRING)
+        assert res == [12110, 2534]
+    def test_toc_decode(self) -> None:
+        """tok_decode of the recorded token ids must return TEST_STRING."""
+        res = self.LM.tok_decode([12110, 2534])
+        assert res == TEST_STRING
+    def test_batch_encode(self) -> None:
+        """The first element returned by tok_batch_encode must hold the recorded ids."""
+        res = self.LM.tok_batch_encode([TEST_STRING, "bar foo"])[0].tolist()
+        assert res == [[12110, 2534], [2009, 17374]]
+    def test_model_generate(self) -> None:
+        """_model_generate with max_length=10 must decode to the recorded string."""
+        context = self.LM.tok_batch_encode([TEST_STRING])[0]
+        res = self.LM._model_generate(context, max_length=10, stop=["\n\n"])
+        res = self.LM.tok_decode(res[0])
+        assert res == "foo bar\n<bazhang>!info bar"

scripts/yans/lm-evaluation-harness/tests/models/test_neuron_optimum.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import pytest
+import torch
+from lm_eval.models.neuron_optimum import wrap_constant_batch_size
+def test_wrap_constant_batch_size():
+    class Tester:
+        def __init__(self, batch_size):
+            self.batch_size = batch_size
+        @wrap_constant_batch_size
+        def test_constant_batch_size(self, inputs):
+            assert len(inputs) == self.batch_size
+            return inputs
+    batch_size_test = 8
+    for i in range(1, batch_size_test + 1):
+        tensor = torch.ones([i, 2, 2])
+        out = Tester(batch_size=batch_size_test).test_constant_batch_size(tensor)
+        torch.testing.assert_allclose(out, tensor)
+    with pytest.raises(ValueError):
+        Tester(batch_size=batch_size_test).test_constant_batch_size(
+            torch.ones([batch_size_test + 1, 2, 2])
+        )

scripts/yans/lm-evaluation-harness/tests/models/test_openvino.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import random
+import tempfile
+from pathlib import Path
+import pytest
+from optimum.intel import OVModelForCausalLM
+from transformers import AutoTokenizer
+from lm_eval import evaluator
+from lm_eval.api.registry import get_model
+# Maps each model id to the task it is evaluated on; the items are the
+# (model_id, task) parameter pairs for test_evaluator.
+SUPPORTED_ARCHITECTURES_TASKS = {
+    "facebook/opt-125m": "lambada_openai",
+    "hf-internal-testing/tiny-random-gpt2": "wikitext",
+}
+@pytest.mark.parametrize("model_id,task", SUPPORTED_ARCHITECTURES_TASKS.items())
+def test_evaluator(model_id, task):
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        model = OVModelForCausalLM.from_pretrained(
+            model_id, export=True, use_cache=True
+        )
+        model.save_pretrained(tmpdirname)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokenizer.save_pretrained(tmpdirname)
+        lm = get_model("openvino").create_from_arg_string(
+            f"pretrained={tmpdirname}",
+            {
+                "batch_size": 1,
+                "device": "cpu",
+            },
+        )
+        def ll_fn(reqs):
+            for ctx, cont in [req.args for req in reqs]:
+                if len(ctx) == 0:
+                    continue
+                # space convention
+                assert ctx[-1] != " "
+                assert cont[0] == " " or ctx[-1] == "\n"
+            res = []
+            random.seed(42)
+            for _ in reqs:
+                res.extend([(-random.random(), False)])
+            return res
+        def ll_perp_fn(reqs):
+            for (string,) in [req.args for req in reqs]:
+                assert isinstance(string, str)
+            res = []
+            random.seed(42)
+            for _ in reqs:
+                res.extend([-random.random()])
+            return res
+        lm.loglikelihood = ll_fn
+        lm.loglikelihood_rolling = ll_perp_fn
+        limit = 10
+        evaluator.simple_evaluate(
+            model=lm,
+            tasks=[task],
+            num_fewshot=0,
+            limit=limit,
+            bootstrap_iters=10,
+        )
+def test_ov_config():
+    """Test that if specified, a custom OpenVINO config is loaded correctly"""
+    model_id = "hf-internal-testing/tiny-random-gpt2"
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        config_file = str(Path(tmpdirname) / "ov_config.json")
+        with open(Path(config_file), "w", encoding="utf-8") as f:
+            f.write('{"DYNAMIC_QUANTIZATION_GROUP_SIZE" : "32"}')
+        lm = get_model("openvino").create_from_arg_string(
+            f"pretrained={model_id},ov_config={config_file}"
+        )
+    assert (
+        lm.model.request.get_compiled_model().get_property(
+            "DYNAMIC_QUANTIZATION_GROUP_SIZE"
+        )
+        == 32
+    )

scripts/yans/lm-evaluation-harness/tests/models/test_vllm.py ADDED Viewed

	@@ -0,0 +1,50 @@

+from typing import List
+import pytest
+from lm_eval import tasks
+from lm_eval.api.instance import Instance
+# Task manager used by Test_VLLM to load the tasks it builds requests from.
+task_manager = tasks.TaskManager()
+@pytest.mark.skip(reason="requires CUDA")
+class Test_VLLM:
+    """Smoke tests for the VLLM backend: they check result counts and types only.
+    The class is marked as skipped, but the statements in the class body still
+    execute when the module is imported.
+    """
+    vllm = pytest.importorskip("vllm")
+    try:
+        from lm_eval.models.vllm_causallms import VLLM
+        LM = VLLM(pretrained="EleutherAI/pythia-70m")
+    except ModuleNotFoundError:
+        # NOTE(review): LM stays undefined on this path, so the test methods
+        # below would fail on `self.LM` if they were ever run.
+        pass
+    # torch.use_deterministic_algorithms(True)
+    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
+    # Requests for the loglikelihood test (arc_easy).
+    multiple_choice_task = task_list["arc_easy"]  # type: ignore
+    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
+    MULTIPLE_CH: List[Instance] = multiple_choice_task.instances
+    # Requests for the generate_until test (gsm8k), capped at 10 generated tokens.
+    generate_until_task = task_list["gsm8k"]  # type: ignore
+    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
+    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
+    generate_until: List[Instance] = generate_until_task.instances
+    # Requests for the rolling loglikelihood test (wikitext).
+    rolling_task = task_list["wikitext"]  # type: ignore
+    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
+    ROLLING: List[Instance] = rolling_task.instances
+    # TODO: make proper tests
+    def test_logliklihood(self) -> None:
+        """One result per request; the first element of each result is a float."""
+        res = self.LM.loglikelihood(self.MULTIPLE_CH)
+        assert len(res) == len(self.MULTIPLE_CH)
+        for x in res:
+            assert isinstance(x[0], float)
+    def test_generate_until(self) -> None:
+        """One generated string per request."""
+        res = self.LM.generate_until(self.generate_until)
+        assert len(res) == len(self.generate_until)
+        for x in res:
+            assert isinstance(x, str)
+    def test_logliklihood_rolling(self) -> None:
+        """Every rolling loglikelihood result is a float."""
+        res = self.LM.loglikelihood_rolling(self.ROLLING)
+        for x in res:
+            assert isinstance(x, float)

scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v2.0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ 8ebbbc510644ede7bf53496c381e276d5a1eec14828870e8b7e611f231e6d5f6

scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_3ds-v0-res.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"results": {"arithmetic_3ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_3ds": 0}}

scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_4ds-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ d915830b8621e66331383bb2ae4c60acebf008e2f94741092ef4c33ea5441037

scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ 7fab9f02e71a224ae7931aa77f8a9a61d887a7480756adc965d4746e97fb04a5

scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"results": {"blimp_determiner_noun_agreement_with_adj_irregular_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adj_irregular_2": 0}}

scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-res.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"results": {"blimp_determiner_noun_agreement_with_adjective_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adjective_1": 0}}

scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_2-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ 0523771a217759f0b22b89807694ee7f6381ce98a584b1fd070ba96194a3273b

scripts/yans/lm-evaluation-harness/tests/testdata/blimp_inchoative-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3ff73629fb4473986a0e8ae2fcb7c40e88292189ab0d8755d20836c5aa5a2f99

scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_past_participle_verbs-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ 63ec733873f94ace71cb34112d1c3cd5bb768c26b975fb90acc9b8ba3f4e938e

scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ 9534751f83a86b6cbe1fb12fb9feb827b0b7836a663108928b4ecc1d70b08871

scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_c_command-v0-res.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"results": {"blimp_principle_A_c_command": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_c_command": 0}}

scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_reconstruction-v0-res.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"results": {"blimp_principle_A_reconstruction": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_reconstruction": 0}}

scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-res.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"results": {"blimp_wh_island": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_island": 0}}

scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ de5aa6f77a2e0fd050b9c272f10c4d5d5581e4f75ffa60926f79e60ae1738960

scripts/yans/lm-evaluation-harness/tests/testdata/cola-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ e8635578ed8ee70b707a666d35e468b9321db24470f80c92080651e2bfa01751

scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v1-res.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"results": {"coqa": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"coqa": 1}}

scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_disability-v0-res.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"results": {"crows_pairs_french_disability": {"likelihood_difference": 0.31387939561315326, "likelihood_difference_stderr": 0.027598132299657168, "pct_stereotype": 0.36363636363636365, "pct_stereotype_stderr": 0.05966637484671758}}, "versions": {"crows_pairs_french_disability": 0}}

scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_gender-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ 010b8404655911c86555616da23afffce9dc3981e1acbbfdb022d9c474430209

scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ 146eb60c8796fe3f25307a6776337f0b077b58ce02edec64c99df4b906c19b9f

scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-res.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"results": {"crows_pairs_french_nationality": {"likelihood_difference": 0.33534193269044926, "likelihood_difference_stderr": 0.01429836309463257, "pct_stereotype": 0.4743083003952569, "pct_stereotype_stderr": 0.031455431847992904}}, "versions": {"crows_pairs_french_nationality": 0}}

scripts/yans/lm-evaluation-harness/tests/testdata/drop-v0-greedy_until ADDED Viewed

	@@ -0,0 +1 @@


1	+ ca566c630d8ac853d5785d4b5c40a5137172c34b48af3350e1f79e6d548b36ba

scripts/yans/lm-evaluation-harness/tests/testdata/ethics_justice-v0-res.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"results": {"ethics_justice": {"acc": 0.49556213017751477, "acc_stderr": 0.009616784279885177, "em": 0.057692307692307696}}, "versions": {"ethics_justice": 0}}

scripts/yans/lm-evaluation-harness/tests/testdata/gguf_test_52ea409606de8755e03cf7c79f824101a4ce64bb6e6d3df556b8a4e7a5d92418.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e4f122bfaa24901cff1ee686da0cf49ade7b6877c31a3daeb32c8cf2e328a77e
+size 153

scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_cfd11f555a5a63b6dfa114a55a932e51b724cdd44d4842586b9ce37260bf7aaa.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d531b0854314516cad7d56c7e28a694bf23072429147b235e9c6534492867bb2
+size 2984

scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ 09da45119b12a0144e3081f8fb790c2a22af7b9c3aac42f54423d348a711fbf5

scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-res.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"results": {"headqa_en": {"acc": 0.23559445660102116, "acc_norm": 0.2447118891320204, "acc_norm_stderr": 0.008211629406841468, "acc_stderr": 0.008105688874297972}}, "versions": {"headqa_en": 0}}

scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-astronomy-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ bed1e47127cc2893c6aef63b9a0909cca31aa351a703da2a166b01cae03c3311

scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-clinical_knowledge-v0-res.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"results": {"hendrycksTest-clinical_knowledge": {"acc": 0.23773584905660378, "acc_norm": 0.27169811320754716, "acc_norm_stderr": 0.027377706624670713, "acc_stderr": 0.02619980880756191}}, "versions": {"hendrycksTest-clinical_knowledge": 0}}

scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_chemistry-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ f4f338e45415c4b5ee7f1d249155bcd910c8401bd1436760a5ec61cb6bb211b6

scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_computer_science-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ 870d5a6300c527077aaf6baa3e750e75fa840b41657cf82549f39b768b14862d

scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_european_history-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ d8070e113be9d420fef5578cb69c70df4ea5118f9b18553023fd9efd5ff0b7f4

scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_mathematics-v0-res.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"results": {"hendrycksTest-high_school_mathematics": {"acc": 0.22592592592592592, "acc_norm": 0.24814814814814815, "acc_norm_stderr": 0.0263357394040558, "acc_stderr": 0.025497532639609553}}, "versions": {"hendrycksTest-high_school_mathematics": 0}}

scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_statistics-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ 33d1d6eaaa2c3a944bf49d3f220a4efc328d7c3b3465b7cec40ae36d8984b75f

scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ 8c65c1a28330dd001d395ac11f1bb80c3b33f5935f503e74067aef6e9e1d9d9b

scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-jurisprudence-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ cac440189f1ec778e82f4975d88b74689553ecc5116aaa7f76587a50c1a610e0

scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-logical_fallacies-v0-res.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"results": {"hendrycksTest-logical_fallacies": {"acc": 0.20245398773006135, "acc_norm": 0.2147239263803681, "acc_norm_stderr": 0.03226219377286774, "acc_stderr": 0.03157065078911902}}, "versions": {"hendrycksTest-logical_fallacies": 0}}

scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-medical_genetics-v0-res.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"results": {"hendrycksTest-medical_genetics": {"acc": 0.27, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684, "acc_stderr": 0.0446196043338474}}, "versions": {"hendrycksTest-medical_genetics": 0}}

scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-philosophy-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ a419204da36c2b7a70fa8909a3a804260cc3283c7e07917534dfb76216c77f46

scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-world_religions-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ 97a0f68ba30ea3a6ef1db1a2925c964b09ecc54455a0a930da083e52677815bd

scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-en-ar-v0-greedy_until ADDED Viewed

	@@ -0,0 +1 @@


1	+ b20adbcd2c6d135e28600b427113532c5df624cb3a90e8c5e48715c09a3a38fa

scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_fr-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ 5d16f4a0c51dc6d7b6df2ebeba2bbfa51e700b843779b559b3d90183d7b02a11

scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_it-v0-res.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"results": {"lambada_mt_it": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_it": 0}}

scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai-v0-loglikelihood ADDED Viewed

	@@ -0,0 +1 @@


1	+ 6829e6a8aa5922e6c92dd31403cc060f242dc0ede4a775e085a70da095ab2e20

scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_cloze-v0-res.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"results": {"lambada_openai_cloze": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_openai_cloze": 0}}