Add files using upload-large-folder tool
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- scripts/yans/lm-evaluation-harness/tests/models/test_api.py +149 -0
- scripts/yans/lm-evaluation-harness/tests/models/test_gguf.py +152 -0
- scripts/yans/lm-evaluation-harness/tests/models/test_huggingface.py +148 -0
- scripts/yans/lm-evaluation-harness/tests/models/test_neuron_optimum.py +26 -0
- scripts/yans/lm-evaluation-harness/tests/models/test_openvino.py +92 -0
- scripts/yans/lm-evaluation-harness/tests/models/test_vllm.py +50 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v2.0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_3ds-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_4ds-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_2-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_inchoative-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_past_participle_verbs-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_c_command-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_reconstruction-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/cola-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v1-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_disability-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_gender-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/drop-v0-greedy_until +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/ethics_justice-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/gguf_test_52ea409606de8755e03cf7c79f824101a4ce64bb6e6d3df556b8a4e7a5d92418.pkl +3 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_cfd11f555a5a63b6dfa114a55a932e51b724cdd44d4842586b9ce37260bf7aaa.pkl +3 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-astronomy-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-clinical_knowledge-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_chemistry-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_computer_science-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_european_history-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_mathematics-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_statistics-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-jurisprudence-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-logical_fallacies-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-medical_genetics-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-philosophy-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-world_religions-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-en-ar-v0-greedy_until +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_fr-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_it-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_cloze-v0-res.json +1 -0
scripts/yans/lm-evaluation-harness/tests/models/test_api.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from unittest.mock import MagicMock, patch
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
|
| 5 |
+
from lm_eval.models.openai_completions import LocalCompletionsAPI
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@pytest.fixture
def api():
    """Provide a ``LocalCompletionsAPI`` client created with no tokenizer backend."""
    client = LocalCompletionsAPI(
        model="gpt-3.5-turbo",
        tokenizer_backend=None,
        base_url="http://test-url.com",
    )
    return client
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@pytest.fixture
def api_tokenized():
    """Provide a ``LocalCompletionsAPI`` client that uses the ``huggingface`` tokenizer backend."""
    client = LocalCompletionsAPI(
        tokenizer_backend="huggingface",
        model="EleutherAI/pythia-1b",
        base_url="http://test-url.com",
    )
    return client
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def test_create_payload_generate(api):
    """Check the generation payload built by ``_create_payload``.

    The expected payload carries ``until`` under the ``stop`` key and has no
    ``do_sample`` entry.
    """
    generation_options = dict(
        max_tokens=100,
        temperature=0.7,
        until=["The End"],
        do_sample=True,
        seed=1234,
    )
    expected_payload = {
        "prompt": ["Generate a story"],
        "model": "gpt-3.5-turbo",
        "max_tokens": 100,
        "temperature": 0.7,
        "stop": ["The End"],
        "seed": 1234,
    }

    built = api._create_payload(
        ["Generate a story"], generate=True, gen_kwargs=generation_options
    )

    assert built == expected_payload
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def test_create_payload_loglikelihood(api):
    """Check the scoring payload built by ``_create_payload`` when ``generate=False``."""
    expected_payload = {
        "model": "gpt-3.5-turbo",
        "prompt": ["The capital of France is"],
        "max_tokens": 1,
        "logprobs": 1,
        "echo": True,
        "temperature": 0,
        "seed": 1234,
    }

    built = api._create_payload(
        ["The capital of France is"], generate=False, gen_kwargs=None
    )

    assert built == expected_payload
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@pytest.mark.parametrize(
    "input_messages, generate, gen_kwargs, expected_payload",
    [
        pytest.param(
            ["Hello, how are"],
            True,
            {"max_gen_toks": 100, "temperature": 0.7},
            {
                "prompt": "Hello, how are",
                "model": "gpt-3.5-turbo",
                "max_tokens": 100,
                "temperature": 0.7,
                "stop": ["<|endoftext|>"],
                "seed": 1234,
            },
        ),
        pytest.param(
            ["Hello, how are", "you"],
            True,
            {},
            {
                "prompt": "Hello, how are",
                "model": "gpt-3.5-turbo",
                "max_tokens": 256,
                "temperature": 0,
                "stop": ["<|endoftext|>"],
                "seed": 1234,
            },
        ),
    ],
)
def test_model_generate_call_usage(
    api, input_messages, generate, gen_kwargs, expected_payload
):
    """Check that ``model_call`` posts the expected JSON body and returns the decoded reply.

    ``requests.post`` is patched, so no network traffic takes place.
    """
    fake_response = MagicMock()
    fake_response.json.return_value = {"result": "success"}

    with patch("requests.post", return_value=fake_response) as post_mock:
        # Act
        outcome = api.model_call(
            input_messages, generate=generate, gen_kwargs=gen_kwargs
        )

    # Assert
    post_mock.assert_called_once()
    sent_kwargs = post_mock.call_args.kwargs
    assert "json" in sent_kwargs
    assert sent_kwargs["json"] == expected_payload
    assert outcome == {"result": "success"}
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
@pytest.mark.parametrize(
    "input_messages, generate, gen_kwargs, expected_payload",
    [
        pytest.param(
            [[1, 2, 3, 4, 5]],
            False,
            None,
            {
                "model": "EleutherAI/pythia-1b",
                "prompt": [[1, 2, 3, 4, 5]],
                "max_tokens": 1,
                "logprobs": 1,
                "echo": True,
                "seed": 1234,
                "temperature": 0,
            },
        ),
    ],
)
def test_model_tokenized_call_usage(
    api_tokenized, input_messages, generate, gen_kwargs, expected_payload
):
    """Check that token-id prompts are posted unchanged in the scoring payload.

    ``requests.post`` is patched, so no network traffic takes place.
    """
    fake_response = MagicMock()
    fake_response.json.return_value = {"result": "success"}

    with patch("requests.post", return_value=fake_response) as post_mock:
        # Act
        outcome = api_tokenized.model_call(
            input_messages, generate=generate, gen_kwargs=gen_kwargs
        )

    # Assert
    post_mock.assert_called_once()
    sent_kwargs = post_mock.call_args.kwargs
    assert "json" in sent_kwargs
    assert sent_kwargs["json"] == expected_payload
    assert outcome == {"result": "success"}
|
scripts/yans/lm-evaluation-harness/tests/models/test_gguf.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import hashlib
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import pickle
|
| 5 |
+
import unittest
|
| 6 |
+
from unittest.mock import patch
|
| 7 |
+
|
| 8 |
+
from lm_eval.api.instance import Instance
|
| 9 |
+
from lm_eval.models.gguf import GGUFLM
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Server URL handed to GGUFLM in the tests below; the completion call itself is
# patched with gguf_completion_mock in those tests.
base_url = "https://matthoffner-ggml-llm-api.hf.space"
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def gguf_completion_mock(base_url=None, **kwargs):
    """Stand-in for ``GGUFLM.gguf_completion`` that replays cached responses.

    The call parameters are hashed (SHA-256 over their sorted-key JSON form) to
    derive a fixture path under ``./tests/testdata`` (relative to the current
    working directory). When that pickle exists its content is returned.
    Otherwise a canned response is built, written to the fixture path on a
    best-effort basis, and returned.

    Args:
        base_url: Server URL of the call being mocked; it only contributes to
            the cache key.
        **kwargs: Remaining call parameters. They must be JSON serializable.
            The presence of a ``"stop"`` key selects the generation-style
            response instead of the log-probability one.

    Returns:
        dict: A completion-style response containing a ``"choices"`` list.
    """
    # Generate a hash from the parameters
    hash_kwargs = {"base_url": base_url, **kwargs}
    parameters_hash = hashlib.sha256(
        json.dumps(hash_kwargs, sort_keys=True).encode("utf-8")
    ).hexdigest()

    fname = f"./tests/testdata/gguf_test_{parameters_hash}.pkl"

    # EAFP: open directly rather than checking existence first, which avoids a
    # second filesystem lookup and the window between the check and the open.
    try:
        with open(fname, "rb") as fh:
            # NOTE: pickle executes arbitrary code on load. This is acceptable
            # only because the fixtures are trusted files from the repository;
            # never point this helper at untrusted data.
            return pickle.load(fh)
    except FileNotFoundError:
        print("The file does not exist, attempting to write...")

    if "stop" in kwargs:
        result = {
            "choices": [
                {
                    "text": f"generated text until {kwargs['stop']}",
                    "logprobs": {"token_logprobs": [-1.2345], "text_offset": 0},
                    "finish_reason": "length",
                }
            ]
        }
    else:
        # generated with # curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{"prompt": "string", "logprobs": 10, "temperature": 0.0, "max_tokens": 1, "echo": true}'
        result = {
            "id": "cmpl-4023976b-bc6a-43b0-a5a9-629f4216c7f3",
            "object": "text_completion",
            "created": 1700511361,
            "model": "../llama-2-7b.Q8_0.gguf",
            "choices": [
                {
                    "text": "string(",
                    "index": 0,
                    "logprobs": {
                        "text_offset": [0, 7],
                        "token_logprobs": [None, -1.033263319857306],
                        "tokens": [" string", "("],
                        "top_logprobs": [
                            None,
                            {
                                "(": -1.033263319857306,
                                "[]": -2.6530743779017394,
                                ".": -3.0377145947291324,
                                "\n": -3.0399156750513976,
                                "_": -3.510376089937872,
                                " =": -3.6957918347193663,
                                ",": -3.9309459866358702,
                                " of": -4.2834550083949035,
                                '("': -4.322762841112799,
                                "()": -4.426229113466925,
                            },
                        ],
                    },
                    "finish_reason": "length",
                }
            ],
            "usage": {
                "prompt_tokens": 2,
                "completion_tokens": 1,
                "total_tokens": 3,
            },
        }

    # Persisting the fixture is best-effort: a failure to write must not fail
    # the test that asked for the response.
    try:
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        print("Writing file at", fname)
        with open(fname, "wb") as fh:
            pickle.dump(result, fh)
        print("File written successfully")
    except (OSError, pickle.PicklingError) as e:
        print("File writing failed:", e)

    return result
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
class GGUFLMTest(unittest.TestCase):
    """Run ``GGUFLM`` with ``gguf_completion`` patched by the cached-response mock."""

    @patch(
        "lm_eval.models.gguf.GGUFLM.gguf_completion", side_effect=gguf_completion_mock
    )
    def test_loglikelihood(self, gguf_completion_mock):
        """Two identical ("str", "ing") requests are each expected to score ``(0, True)``."""
        model = GGUFLM(base_url)

        # Test loglikelihood
        pairs = [("str", "ing"), ("str", "ing")]
        batch = [
            Instance(
                request_type="loglikelihood",
                doc=pair,
                arguments=pair,
                idx=position,
            )
            for position, pair in enumerate(pairs)
        ]
        scores = model.loglikelihood(batch)

        # Assert the loglikelihood response is correct
        self.assertEqual(scores, [(0, True), (0, True)])

    @patch(
        "lm_eval.models.gguf.GGUFLM.gguf_completion", side_effect=gguf_completion_mock
    )
    def test_generate_until(self, gguf_completion_mock):
        """Each request is expected to yield the mock's text naming its own stop string."""
        model = GGUFLM(base_url)

        # Test generate_until
        cases = [("input1", "stop1"), ("input2", "stop2")]
        batch = [
            Instance(
                request_type="generate_until",
                doc={"input": prompt},
                arguments=(prompt, {"until": stop_sequence}),
                idx=position,
            )
            for position, (prompt, stop_sequence) in enumerate(cases)
        ]

        generated = model.generate_until(batch)

        # Assert the generate_until response is correct
        self.assertEqual(
            generated, ["generated text until stop1", "generated text until stop2"]
        )
|
| 137 |
+
|
| 138 |
+
# @patch('lm_eval.models.gguf.GGUFLM.gguf_completion', side_effect=gguf_completion_mock)
|
| 139 |
+
# def test_loglikelihood_rolling(self, gguf_completion_mock):
|
| 140 |
+
# lm = GGUFLM(base_url)
|
| 141 |
+
|
| 142 |
+
# # Test loglikelihood_rolling
|
| 143 |
+
# requests = ["input1", "input2"]
|
| 144 |
+
# res = lm.loglikelihood_rolling(requests)
|
| 145 |
+
|
| 146 |
+
# # Assert the loglikelihood_rolling response is correct
|
| 147 |
+
# expected_res = [(-1.2345, True), (-1.2345, True)]
|
| 148 |
+
# self.assertEqual(res, expected_res)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
# Allow running this test module directly as a script via unittest's runner.
if __name__ == "__main__":
    unittest.main()
|
scripts/yans/lm-evaluation-harness/tests/models/test_huggingface.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import torch
|
| 9 |
+
|
| 10 |
+
from lm_eval import tasks
|
| 11 |
+
from lm_eval.api.instance import Instance
|
| 12 |
+
from lm_eval.models.huggingface import HFLM
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# Set before any model/tokenizer is created below.
# NOTE(review): presumably silences the HF tokenizers parallelism warning - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Task manager used to load the evaluation tasks for the test class.
task_manager = tasks.TaskManager()

# Sample text used by the tokenizer encode/decode/generate tests.
TEST_STRING = "foo bar"
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class Test_HFLM:
    """Regression tests for ``HFLM`` with ``EleutherAI/pythia-70m`` on CPU in float32.

    The class body builds the request batches (first 10 documents of
    ``arc_easy``, ``gsm8k`` and ``wikitext``) and loads the model once; the
    test methods compare model outputs with the recorded reference values.
    """

    torch.use_deterministic_algorithms(True)
    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
    # Minor Python version, used to name the per-interpreter log file.
    version_minor = sys.version_info.minor

    # Multiple-choice (loglikelihood) requests.
    multiple_choice_task = task_list["arc_easy"]  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: list[Instance] = multiple_choice_task.instances

    # Generation requests, capped at 10 generated tokens.
    generate_until_task = task_list["gsm8k"]  # type: ignore
    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
    generate_until_task.set_fewshot_seed(1234)  # fewshot random generator seed
    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    generate_until: list[Instance] = generate_until_task.instances

    # Rolling loglikelihood requests.
    rolling_task = task_list["wikitext"]  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
    ROLLING: list[Instance] = rolling_task.instances

    # Reference log-likelihoods for MULTIPLE_CH.
    MULTIPLE_CH_RES = [
        -41.902435302734375,
        -42.939308166503906,
        -33.914180755615234,
        -37.07139205932617,
        -22.95258331298828,
        -20.342208862304688,
        -14.818366050720215,
        -27.942853927612305,
        -15.80704116821289,
        -15.936427116394043,
        -13.052018165588379,
        -18.04828453063965,
        -13.345029830932617,
        -13.366025924682617,
        -12.127134323120117,
        -11.872495651245117,
        -47.10598373413086,
        -47.76410675048828,
        -36.4406852722168,
        -50.0289421081543,
        -16.72093963623047,
        -18.535587310791016,
        -26.46993637084961,
        -20.355995178222656,
        -17.757919311523438,
        -21.80595588684082,
        -33.1990852355957,
        -39.28636932373047,
        -14.759679794311523,
        -16.753942489624023,
        -11.486852645874023,
        -15.42177677154541,
        -13.15798282623291,
        -15.887393951416016,
        -15.28614616394043,
        -12.339089393615723,
        -44.59441375732422,
        -55.40888214111328,
        -52.70050811767578,
        -56.25089645385742,
    ]
    # Reference generations for generate_until.
    generate_until_RES = [
        " The average of $2.50 each is $",
        " A robe takes 2 bolts of blue fiber and half",
        " $50,000 in repairs.\n\nQuestion",
        " He runs 1 sprint 3 times a week.",
        " They feed each of her chickens three cups of mixed",
        " The price of the glasses is $5, but",
        " The total percentage of students who said they like to",
        " Carla is downloading a 200 GB file. Normally",
        " John drives for 3 hours at a speed of 60",
        " Eliza sells 4 tickets to 5 friends so she",
    ]
    # Reference rolling log-likelihoods for ROLLING.
    ROLLING_RES = [
        -3603.6328125,
        -19779.23974609375,
        -8834.16455078125,
        -27967.591796875,
        -7636.794982910156,
        -9491.93505859375,
        -41043.4248046875,
        -8397.689819335938,
        -45969.47155761719,
        -7158.90625,
    ]
    LM = HFLM(pretrained="EleutherAI/pythia-70m", device="cpu", dtype="float32")

    def test_logliklihood(self) -> None:
        """Compare log-likelihoods, and the argmax over rows of four, with the references."""
        expected = self.MULTIPLE_CH_RES
        observed = [pair[0] for pair in self.LM.loglikelihood(self.MULTIPLE_CH)]

        # log samples to CI
        log_dir = Path("test_logs")
        log_dir.mkdir(parents=True, exist_ok=True)
        log_file = (log_dir / f"outputs_log_{self.version_minor}.txt").resolve()
        log_file.write_text("\n".join(map(str, observed)), encoding="utf-8")

        assert np.allclose(observed, expected, atol=1e-2)

        # check indices for Multiple Choice
        expected_choice = np.argmax(np.array(expected).reshape(-1, 4), axis=1)
        observed_choice = np.argmax(np.array(observed).reshape(-1, 4), axis=1)
        assert (expected_choice == observed_choice).all()

    def test_generate_until(self) -> None:
        """Generated continuations must equal the recorded strings exactly."""
        generated = self.LM.generate_until(self.generate_until)
        assert generated == self.generate_until_RES

    def test_logliklihood_rolling(self) -> None:
        """Rolling log-likelihoods must match the references within ``atol=1e-1``."""
        scores = self.LM.loglikelihood_rolling(self.ROLLING)
        assert np.allclose(scores, self.ROLLING_RES, atol=1e-1)

    def test_toc_encode(self) -> None:
        """``tok_encode`` must map the sample text to the recorded token ids."""
        token_ids = self.LM.tok_encode(TEST_STRING)
        assert token_ids == [12110, 2534]

    def test_toc_decode(self) -> None:
        """``tok_decode`` must map the recorded token ids back to the sample text."""
        text = self.LM.tok_decode([12110, 2534])
        assert text == TEST_STRING

    def test_batch_encode(self) -> None:
        """The first item returned by ``tok_batch_encode`` must hold the recorded ids."""
        encoded = self.LM.tok_batch_encode([TEST_STRING, "bar foo"])
        assert encoded[0].tolist() == [[12110, 2534], [2009, 17374]]

    def test_model_generate(self) -> None:
        """``_model_generate`` with ``max_length=10`` must decode to the recorded text."""
        context = self.LM.tok_batch_encode([TEST_STRING])[0]
        generated = self.LM._model_generate(context, max_length=10, stop=["\n\n"])
        decoded = self.LM.tok_decode(generated[0])
        assert decoded == "foo bar\n<bazhang>!info bar"
|
scripts/yans/lm-evaluation-harness/tests/models/test_neuron_optimum.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
import torch
|
| 3 |
+
|
| 4 |
+
from lm_eval.models.neuron_optimum import wrap_constant_batch_size
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def test_wrap_constant_batch_size():
    """Check ``wrap_constant_batch_size`` for batches up to and beyond ``batch_size``.

    For every input of 1..``batch_size`` rows, the wrapped method must see
    exactly ``batch_size`` rows while the caller gets back a tensor equal to
    its input. An input with more than ``batch_size`` rows must raise
    ``ValueError``.
    """

    class Tester:
        def __init__(self, batch_size):
            self.batch_size = batch_size

        @wrap_constant_batch_size
        def test_constant_batch_size(self, inputs):
            # The decorator is expected to hand over a full-size batch.
            assert len(inputs) == self.batch_size
            return inputs

    batch_size_test = 8
    for i in range(1, batch_size_test + 1):
        tensor = torch.ones([i, 2, 2])
        out = Tester(batch_size=batch_size_test).test_constant_batch_size(tensor)
        # assert_close replaces torch.testing.assert_allclose, which PyTorch
        # has deprecated in its favour.
        torch.testing.assert_close(out, tensor)

    with pytest.raises(ValueError):
        Tester(batch_size=batch_size_test).test_constant_batch_size(
            torch.ones([batch_size_test + 1, 2, 2])
        )
|
scripts/yans/lm-evaluation-harness/tests/models/test_openvino.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
import tempfile
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
from optimum.intel import OVModelForCausalLM
|
| 7 |
+
from transformers import AutoTokenizer
|
| 8 |
+
|
| 9 |
+
from lm_eval import evaluator
|
| 10 |
+
from lm_eval.api.registry import get_model
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Maps each model id to the task it is evaluated on; the items are used as the
# (model_id, task) parameter pairs of test_evaluator.
SUPPORTED_ARCHITECTURES_TASKS = {
    "facebook/opt-125m": "lambada_openai",
    "hf-internal-testing/tiny-random-gpt2": "wikitext",
}
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@pytest.mark.parametrize("model_id,task", SUPPORTED_ARCHITECTURES_TASKS.items())
def test_evaluator(model_id, task):
    """Smoke-test ``simple_evaluate`` with an OpenVINO model exported to a temp dir.

    The model's ``loglikelihood`` and ``loglikelihood_rolling`` are replaced by
    stubs that validate the incoming requests and return seeded random scores.
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        exported_model = OVModelForCausalLM.from_pretrained(
            model_id, export=True, use_cache=True
        )
        exported_model.save_pretrained(tmpdirname)
        AutoTokenizer.from_pretrained(model_id).save_pretrained(tmpdirname)

        model_args = f"pretrained={tmpdirname}"
        extra_config = {"batch_size": 1, "device": "cpu"}
        lm = get_model("openvino").create_from_arg_string(model_args, extra_config)

        def ll_fn(reqs):
            for request in reqs:
                context, continuation = request.args
                if len(context) > 0:
                    # space convention
                    assert context[-1] != " "
                    assert continuation[0] == " " or context[-1] == "\n"

            random.seed(42)
            return [(-random.random(), False) for _ in reqs]

        def ll_perp_fn(reqs):
            for request in reqs:
                (text,) = request.args
                assert isinstance(text, str)

            random.seed(42)
            return [-random.random() for _ in reqs]

        lm.loglikelihood = ll_fn
        lm.loglikelihood_rolling = ll_perp_fn

        doc_limit = 10
        evaluator.simple_evaluate(
            model=lm,
            tasks=[task],
            num_fewshot=0,
            limit=doc_limit,
            bootstrap_iters=10,
        )
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def test_ov_config():
    """Check that a custom OpenVINO config file, when given, reaches the compiled model."""
    model_id = "hf-internal-testing/tiny-random-gpt2"
    with tempfile.TemporaryDirectory() as tmpdirname:
        config_file = str(Path(tmpdirname) / "ov_config.json")
        Path(config_file).write_text(
            '{"DYNAMIC_QUANTIZATION_GROUP_SIZE" : "32"}', encoding="utf-8"
        )

        lm = get_model("openvino").create_from_arg_string(
            f"pretrained={model_id},ov_config={config_file}"
        )

        compiled_model = lm.model.request.get_compiled_model()
        group_size = compiled_model.get_property("DYNAMIC_QUANTIZATION_GROUP_SIZE")
        assert group_size == 32
|
scripts/yans/lm-evaluation-harness/tests/models/test_vllm.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
|
| 5 |
+
from lm_eval import tasks
|
| 6 |
+
from lm_eval.api.instance import Instance
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Task manager used by Test_VLLM to load the evaluation tasks.
task_manager = tasks.TaskManager()
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@pytest.mark.skip(reason="requires CUDA")
class Test_VLLM:
    """Shape and type checks for the vLLM backend with ``EleutherAI/pythia-70m``.

    The class is marked as skipped ("requires CUDA"). The class body still
    prepares the request batches from the first 10 documents of ``arc_easy``,
    ``gsm8k`` and ``wikitext``.
    """

    vllm = pytest.importorskip("vllm")
    try:
        from lm_eval.models.vllm_causallms import VLLM

        LM = VLLM(pretrained="EleutherAI/pythia-70m")
    except ModuleNotFoundError:
        pass
    # torch.use_deterministic_algorithms(True)
    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])

    # Multiple-choice (loglikelihood) requests.
    multiple_choice_task = task_list["arc_easy"]  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: List[Instance] = multiple_choice_task.instances

    # Generation requests, capped at 10 generated tokens.
    generate_until_task = task_list["gsm8k"]  # type: ignore
    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    generate_until: List[Instance] = generate_until_task.instances

    # Rolling loglikelihood requests.
    rolling_task = task_list["wikitext"]  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
    ROLLING: List[Instance] = rolling_task.instances

    # TODO: make proper tests
    def test_logliklihood(self) -> None:
        """One result per request, each starting with a float score."""
        scored = self.LM.loglikelihood(self.MULTIPLE_CH)
        assert len(scored) == len(self.MULTIPLE_CH)
        assert all(isinstance(entry[0], float) for entry in scored)

    def test_generate_until(self) -> None:
        """One string result per generation request."""
        generated = self.LM.generate_until(self.generate_until)
        assert len(generated) == len(self.generate_until)
        assert all(isinstance(text, str) for text in generated)

    def test_logliklihood_rolling(self) -> None:
        """Every rolling result is a float."""
        scores = self.LM.loglikelihood_rolling(self.ROLLING)
        assert all(isinstance(score, float) for score in scores)
|
scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v2.0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
8ebbbc510644ede7bf53496c381e276d5a1eec14828870e8b7e611f231e6d5f6
|
scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_3ds-v0-res.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"results": {"arithmetic_3ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_3ds": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_4ds-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
d915830b8621e66331383bb2ae4c60acebf008e2f94741092ef4c33ea5441037
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
7fab9f02e71a224ae7931aa77f8a9a61d887a7480756adc965d4746e97fb04a5
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"results": {"blimp_determiner_noun_agreement_with_adj_irregular_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adj_irregular_2": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-res.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"results": {"blimp_determiner_noun_agreement_with_adjective_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adjective_1": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_2-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
0523771a217759f0b22b89807694ee7f6381ce98a584b1fd070ba96194a3273b
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_inchoative-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3ff73629fb4473986a0e8ae2fcb7c40e88292189ab0d8755d20836c5aa5a2f99
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_past_participle_verbs-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
63ec733873f94ace71cb34112d1c3cd5bb768c26b975fb90acc9b8ba3f4e938e
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
9534751f83a86b6cbe1fb12fb9feb827b0b7836a663108928b4ecc1d70b08871
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_c_command-v0-res.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"results": {"blimp_principle_A_c_command": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_c_command": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_reconstruction-v0-res.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"results": {"blimp_principle_A_reconstruction": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_reconstruction": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-res.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"results": {"blimp_wh_island": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_island": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
de5aa6f77a2e0fd050b9c272f10c4d5d5581e4f75ffa60926f79e60ae1738960
|
scripts/yans/lm-evaluation-harness/tests/testdata/cola-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
e8635578ed8ee70b707a666d35e468b9321db24470f80c92080651e2bfa01751
|
scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v1-res.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"results": {"coqa": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"coqa": 1}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_disability-v0-res.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"results": {"crows_pairs_french_disability": {"likelihood_difference": 0.31387939561315326, "likelihood_difference_stderr": 0.027598132299657168, "pct_stereotype": 0.36363636363636365, "pct_stereotype_stderr": 0.05966637484671758}}, "versions": {"crows_pairs_french_disability": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_gender-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
010b8404655911c86555616da23afffce9dc3981e1acbbfdb022d9c474430209
|
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
146eb60c8796fe3f25307a6776337f0b077b58ce02edec64c99df4b906c19b9f
|
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-res.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"results": {"crows_pairs_french_nationality": {"likelihood_difference": 0.33534193269044926, "likelihood_difference_stderr": 0.01429836309463257, "pct_stereotype": 0.4743083003952569, "pct_stereotype_stderr": 0.031455431847992904}}, "versions": {"crows_pairs_french_nationality": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/drop-v0-greedy_until
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ca566c630d8ac853d5785d4b5c40a5137172c34b48af3350e1f79e6d548b36ba
|
scripts/yans/lm-evaluation-harness/tests/testdata/ethics_justice-v0-res.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"results": {"ethics_justice": {"acc": 0.49556213017751477, "acc_stderr": 0.009616784279885177, "em": 0.057692307692307696}}, "versions": {"ethics_justice": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/gguf_test_52ea409606de8755e03cf7c79f824101a4ce64bb6e6d3df556b8a4e7a5d92418.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e4f122bfaa24901cff1ee686da0cf49ade7b6877c31a3daeb32c8cf2e328a77e
|
| 3 |
+
size 153
|
scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_cfd11f555a5a63b6dfa114a55a932e51b724cdd44d4842586b9ce37260bf7aaa.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d531b0854314516cad7d56c7e28a694bf23072429147b235e9c6534492867bb2
|
| 3 |
+
size 2984
|
scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
09da45119b12a0144e3081f8fb790c2a22af7b9c3aac42f54423d348a711fbf5
|
scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-res.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"results": {"headqa_en": {"acc": 0.23559445660102116, "acc_norm": 0.2447118891320204, "acc_norm_stderr": 0.008211629406841468, "acc_stderr": 0.008105688874297972}}, "versions": {"headqa_en": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-astronomy-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
bed1e47127cc2893c6aef63b9a0909cca31aa351a703da2a166b01cae03c3311
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-clinical_knowledge-v0-res.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"results": {"hendrycksTest-clinical_knowledge": {"acc": 0.23773584905660378, "acc_norm": 0.27169811320754716, "acc_norm_stderr": 0.027377706624670713, "acc_stderr": 0.02619980880756191}}, "versions": {"hendrycksTest-clinical_knowledge": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_chemistry-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
f4f338e45415c4b5ee7f1d249155bcd910c8401bd1436760a5ec61cb6bb211b6
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_computer_science-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
870d5a6300c527077aaf6baa3e750e75fa840b41657cf82549f39b768b14862d
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_european_history-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
d8070e113be9d420fef5578cb69c70df4ea5118f9b18553023fd9efd5ff0b7f4
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_mathematics-v0-res.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"results": {"hendrycksTest-high_school_mathematics": {"acc": 0.22592592592592592, "acc_norm": 0.24814814814814815, "acc_norm_stderr": 0.0263357394040558, "acc_stderr": 0.025497532639609553}}, "versions": {"hendrycksTest-high_school_mathematics": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_statistics-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
33d1d6eaaa2c3a944bf49d3f220a4efc328d7c3b3465b7cec40ae36d8984b75f
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
8c65c1a28330dd001d395ac11f1bb80c3b33f5935f503e74067aef6e9e1d9d9b
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-jurisprudence-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
cac440189f1ec778e82f4975d88b74689553ecc5116aaa7f76587a50c1a610e0
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-logical_fallacies-v0-res.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"results": {"hendrycksTest-logical_fallacies": {"acc": 0.20245398773006135, "acc_norm": 0.2147239263803681, "acc_norm_stderr": 0.03226219377286774, "acc_stderr": 0.03157065078911902}}, "versions": {"hendrycksTest-logical_fallacies": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-medical_genetics-v0-res.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"results": {"hendrycksTest-medical_genetics": {"acc": 0.27, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684, "acc_stderr": 0.0446196043338474}}, "versions": {"hendrycksTest-medical_genetics": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-philosophy-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
a419204da36c2b7a70fa8909a3a804260cc3283c7e07917534dfb76216c77f46
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-world_religions-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
97a0f68ba30ea3a6ef1db1a2925c964b09ecc54455a0a930da083e52677815bd
|
scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-en-ar-v0-greedy_until
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
b20adbcd2c6d135e28600b427113532c5df624cb3a90e8c5e48715c09a3a38fa
|
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_fr-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
5d16f4a0c51dc6d7b6df2ebeba2bbfa51e700b843779b559b3d90183d7b02a11
|
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_it-v0-res.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"results": {"lambada_mt_it": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_it": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai-v0-loglikelihood
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
6829e6a8aa5922e6c92dd31403cc060f242dc0ede4a775e085a70da095ab2e20
|
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_cloze-v0-res.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"results": {"lambada_openai_cloze": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_openai_cloze": 0}}
|