koichi12 committed on
Commit
0ba7ae8
·
verified ·
1 Parent(s): 277ed5d

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. scripts/yans/lm-evaluation-harness/tests/models/test_api.py +149 -0
  2. scripts/yans/lm-evaluation-harness/tests/models/test_gguf.py +152 -0
  3. scripts/yans/lm-evaluation-harness/tests/models/test_huggingface.py +148 -0
  4. scripts/yans/lm-evaluation-harness/tests/models/test_neuron_optimum.py +26 -0
  5. scripts/yans/lm-evaluation-harness/tests/models/test_openvino.py +92 -0
  6. scripts/yans/lm-evaluation-harness/tests/models/test_vllm.py +50 -0
  7. scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v2.0-loglikelihood +1 -0
  8. scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_3ds-v0-res.json +1 -0
  9. scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_4ds-v0-loglikelihood +1 -0
  10. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood +1 -0
  11. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json +1 -0
  12. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-res.json +1 -0
  13. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_2-v0-loglikelihood +1 -0
  14. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_inchoative-v0-loglikelihood +1 -0
  15. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_past_participle_verbs-v0-loglikelihood +1 -0
  16. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood +1 -0
  17. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_c_command-v0-res.json +1 -0
  18. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_reconstruction-v0-res.json +1 -0
  19. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-res.json +1 -0
  20. scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v0-loglikelihood +1 -0
  21. scripts/yans/lm-evaluation-harness/tests/testdata/cola-v0-loglikelihood +1 -0
  22. scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v1-res.json +1 -0
  23. scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_disability-v0-res.json +1 -0
  24. scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_gender-v0-loglikelihood +1 -0
  25. scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-loglikelihood +1 -0
  26. scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-res.json +1 -0
  27. scripts/yans/lm-evaluation-harness/tests/testdata/drop-v0-greedy_until +1 -0
  28. scripts/yans/lm-evaluation-harness/tests/testdata/ethics_justice-v0-res.json +1 -0
  29. scripts/yans/lm-evaluation-harness/tests/testdata/gguf_test_52ea409606de8755e03cf7c79f824101a4ce64bb6e6d3df556b8a4e7a5d92418.pkl +3 -0
  30. scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_cfd11f555a5a63b6dfa114a55a932e51b724cdd44d4842586b9ce37260bf7aaa.pkl +3 -0
  31. scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-loglikelihood +1 -0
  32. scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-res.json +1 -0
  33. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-astronomy-v0-loglikelihood +1 -0
  34. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-clinical_knowledge-v0-res.json +1 -0
  35. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_chemistry-v0-loglikelihood +1 -0
  36. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_computer_science-v0-loglikelihood +1 -0
  37. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_european_history-v0-loglikelihood +1 -0
  38. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_mathematics-v0-res.json +1 -0
  39. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_statistics-v0-loglikelihood +1 -0
  40. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-loglikelihood +1 -0
  41. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-jurisprudence-v0-loglikelihood +1 -0
  42. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-logical_fallacies-v0-res.json +1 -0
  43. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-medical_genetics-v0-res.json +1 -0
  44. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-philosophy-v0-loglikelihood +1 -0
  45. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-world_religions-v0-loglikelihood +1 -0
  46. scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-en-ar-v0-greedy_until +1 -0
  47. scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_fr-v0-loglikelihood +1 -0
  48. scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_it-v0-res.json +1 -0
  49. scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai-v0-loglikelihood +1 -0
  50. scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_cloze-v0-res.json +1 -0
scripts/yans/lm-evaluation-harness/tests/models/test_api.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from unittest.mock import MagicMock, patch
2
+
3
+ import pytest
4
+
5
+ from lm_eval.models.openai_completions import LocalCompletionsAPI
6
+
7
+
8
@pytest.fixture
def api():
    """Provide a ``LocalCompletionsAPI`` client configured without a tokenizer.

    The client is pointed at the placeholder URL ``http://test-url.com`` and
    uses the model name ``gpt-3.5-turbo``.
    """
    client = LocalCompletionsAPI(
        base_url="http://test-url.com",
        tokenizer_backend=None,
        model="gpt-3.5-turbo",
    )
    return client
13
+
14
+
15
@pytest.fixture
def api_tokenized():
    """Provide a ``LocalCompletionsAPI`` client using the Hugging Face tokenizer backend.

    The client is pointed at the placeholder URL ``http://test-url.com`` and
    uses ``EleutherAI/pythia-1b`` as the model name.
    """
    client = LocalCompletionsAPI(
        base_url="http://test-url.com",
        model="EleutherAI/pythia-1b",
        tokenizer_backend="huggingface",
    )
    return client
22
+
23
+
24
def test_create_payload_generate(api):
    """Check the payload ``_create_payload`` builds for a generation request.

    The expected payload carries ``until`` under the key ``stop`` and has no
    ``do_sample`` entry.
    """
    prompts = ["Generate a story"]
    generation_options = {
        "max_tokens": 100,
        "temperature": 0.7,
        "until": ["The End"],
        "do_sample": True,
        "seed": 1234,
    }
    expected = {
        "prompt": ["Generate a story"],
        "model": "gpt-3.5-turbo",
        "max_tokens": 100,
        "temperature": 0.7,
        "stop": ["The End"],
        "seed": 1234,
    }

    actual = api._create_payload(
        prompts, generate=True, gen_kwargs=generation_options
    )

    assert actual == expected
43
+
44
+
45
def test_create_payload_loglikelihood(api):
    """Check the payload ``_create_payload`` builds for a loglikelihood request.

    With ``generate=False`` and no generation kwargs, the expected payload
    asks for one token with logprobs, echo enabled and temperature 0.
    """
    prompts = ["The capital of France is"]
    expected = {
        "model": "gpt-3.5-turbo",
        "prompt": ["The capital of France is"],
        "max_tokens": 1,
        "logprobs": 1,
        "echo": True,
        "temperature": 0,
        "seed": 1234,
    }

    actual = api._create_payload(prompts, generate=False, gen_kwargs=None)

    assert actual == expected
58
+
59
+
60
@pytest.mark.parametrize(
    "input_messages, generate, gen_kwargs, expected_payload",
    [
        (
            ["Hello, how are"],
            True,
            {"max_gen_toks": 100, "temperature": 0.7},
            {
                "prompt": "Hello, how are",
                "model": "gpt-3.5-turbo",
                "max_tokens": 100,
                "temperature": 0.7,
                "stop": ["<|endoftext|>"],
                "seed": 1234,
            },
        ),
        (
            ["Hello, how are", "you"],
            True,
            {},
            {
                "prompt": "Hello, how are",
                "model": "gpt-3.5-turbo",
                "max_tokens": 256,
                "temperature": 0,
                "stop": ["<|endoftext|>"],
                "seed": 1234,
            },
        ),
    ],
)
def test_model_generate_call_usage(
    api, input_messages, generate, gen_kwargs, expected_payload
):
    """Check that ``model_call`` posts the expected JSON body for generation.

    ``requests.post`` is patched, so no HTTP request is sent; the test
    inspects the keyword arguments of the single recorded call.
    """
    fake_response = MagicMock()
    fake_response.json.return_value = {"result": "success"}

    with patch("requests.post") as post_spy:
        post_spy.return_value = fake_response

        # Act
        outcome = api.model_call(
            input_messages, generate=generate, gen_kwargs=gen_kwargs
        )

        # Assert
        post_spy.assert_called_once()
        sent_kwargs = post_spy.call_args[1]
        assert "json" in sent_kwargs
        assert sent_kwargs["json"] == expected_payload
        assert outcome == {"result": "success"}
110
+
111
+
112
@pytest.mark.parametrize(
    "input_messages, generate, gen_kwargs, expected_payload",
    [
        (
            [[1, 2, 3, 4, 5]],
            False,
            None,
            {
                "model": "EleutherAI/pythia-1b",
                "prompt": [[1, 2, 3, 4, 5]],
                "max_tokens": 1,
                "logprobs": 1,
                "echo": True,
                "seed": 1234,
                "temperature": 0,
            },
        ),
    ],
)
def test_model_tokenized_call_usage(
    api_tokenized, input_messages, generate, gen_kwargs, expected_payload
):
    """Check that ``model_call`` posts the expected JSON body for token-id prompts.

    ``requests.post`` is patched, so no HTTP request is sent; the test
    inspects the keyword arguments of the single recorded call.
    """
    fake_response = MagicMock()
    fake_response.json.return_value = {"result": "success"}

    with patch("requests.post") as post_spy:
        post_spy.return_value = fake_response

        # Act
        outcome = api_tokenized.model_call(
            input_messages, generate=generate, gen_kwargs=gen_kwargs
        )

        # Assert
        post_spy.assert_called_once()
        sent_kwargs = post_spy.call_args[1]
        assert "json" in sent_kwargs
        assert sent_kwargs["json"] == expected_payload
        assert outcome == {"result": "success"}
scripts/yans/lm-evaluation-harness/tests/models/test_gguf.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import json
3
+ import os
4
+ import pickle
5
+ import unittest
6
+ from unittest.mock import patch
7
+
8
+ from lm_eval.api.instance import Instance
9
+ from lm_eval.models.gguf import GGUFLM
10
+
11
+
12
+ base_url = "https://matthoffner-ggml-llm-api.hf.space"
13
+
14
+
15
def gguf_completion_mock(base_url=None, **kwargs):
    """Return a canned completion response, cached on disk per argument set.

    The arguments are hashed (SHA-256 of their sorted-key JSON form) to pick a
    pickle file under ``./tests/testdata``. If that file exists its contents
    are returned; otherwise a fixed response is built, written to the file on
    a best-effort basis, and returned.

    Args:
        base_url: Included in the hash alongside ``kwargs``.
        **kwargs: Request parameters. The presence of ``"stop"`` selects the
            generation-style response; otherwise a logprobs-style response is
            used.

    Returns:
        The response dictionary, either loaded from the cache or freshly built.
    """
    # Derive the cache file name from every argument, independent of key order.
    key_material = dict(kwargs)
    key_material["base_url"] = base_url
    serialized = json.dumps(key_material, sort_keys=True).encode("utf-8")
    digest = hashlib.sha256(serialized).hexdigest()
    cache_path = f"./tests/testdata/gguf_test_{digest}.pkl"

    if os.path.exists(cache_path):
        # NOTE: pickle is only acceptable here because the file is a fixture
        # produced by this same function; never point this at untrusted data.
        with open(cache_path, "rb") as handle:
            return pickle.load(handle)

    print("The file does not exist, attempting to write...")
    if "stop" in kwargs:
        result = {
            "choices": [
                {
                    "text": f"generated text until {kwargs['stop']}",
                    "logprobs": {"token_logprobs": [-1.2345], "text_offset": 0},
                    "finish_reason": "length",
                }
            ]
        }
    else:
        # generated with # curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{"prompt": "string", "logprobs": 10, "temperature": 0.0, "max_tokens": 1, "echo": true}'
        result = {
            "id": "cmpl-4023976b-bc6a-43b0-a5a9-629f4216c7f3",
            "object": "text_completion",
            "created": 1700511361,
            "model": "../llama-2-7b.Q8_0.gguf",
            "choices": [
                {
                    "text": "string(",
                    "index": 0,
                    "logprobs": {
                        "text_offset": [0, 7],
                        "token_logprobs": [None, -1.033263319857306],
                        "tokens": [" string", "("],
                        "top_logprobs": [
                            None,
                            {
                                "(": -1.033263319857306,
                                "[]": -2.6530743779017394,
                                ".": -3.0377145947291324,
                                "\n": -3.0399156750513976,
                                "_": -3.510376089937872,
                                " =": -3.6957918347193663,
                                ",": -3.9309459866358702,
                                " of": -4.2834550083949035,
                                '("': -4.322762841112799,
                                "()": -4.426229113466925,
                            },
                        ],
                    },
                    "finish_reason": "length",
                }
            ],
            "usage": {
                "prompt_tokens": 2,
                "completion_tokens": 1,
                "total_tokens": 3,
            },
        }

    # Persisting the response is best-effort: a failure is reported, not raised.
    try:
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        print("Writing file at", cache_path)
        with open(cache_path, "wb") as handle:
            pickle.dump(result, handle)
        print("File written successfully")
    except Exception as err:
        print("File writing failed:", err)

    return result
90
+
91
+
92
class GGUFLMTest(unittest.TestCase):
    """Tests for ``GGUFLM`` with ``gguf_completion`` replaced by the local mock."""

    @patch(
        "lm_eval.models.gguf.GGUFLM.gguf_completion", side_effect=gguf_completion_mock
    )
    def test_loglikelihood(self, mocked_completion):
        """Two identical loglikelihood requests each yield ``(0, True)``."""
        model = GGUFLM(base_url)

        # Test loglikelihood
        pairs = [("str", "ing"), ("str", "ing")]
        instances = []
        for position, pair in enumerate(pairs):
            instances.append(
                Instance(
                    request_type="loglikelihood",
                    doc=pair,
                    arguments=pair,
                    idx=position,
                )
            )
        scores = model.loglikelihood(instances)

        # Assert the loglikelihood response is correct
        self.assertEqual(scores, [(0, True), (0, True)])

    @patch(
        "lm_eval.models.gguf.GGUFLM.gguf_completion", side_effect=gguf_completion_mock
    )
    def test_generate_until(self, mocked_completion):
        """Each generate_until request returns the mock's text for its stop string."""
        model = GGUFLM(base_url)

        # Test generate_until
        cases = [("input1", "stop1"), ("input2", "stop2")]
        instances = []
        for position, (prompt, stop_marker) in enumerate(cases):
            instances.append(
                Instance(
                    request_type="generate_until",
                    doc={"input": prompt},
                    arguments=(prompt, {"until": stop_marker}),
                    idx=position,
                )
            )

        outputs = model.generate_until(instances)

        # Assert the generate_until response is correct
        self.assertEqual(
            outputs, ["generated text until stop1", "generated text until stop2"]
        )

    # @patch('lm_eval.models.gguf.GGUFLM.gguf_completion', side_effect=gguf_completion_mock)
    # def test_loglikelihood_rolling(self, gguf_completion_mock):
    #     lm = GGUFLM(base_url)

    #     # Test loglikelihood_rolling
    #     requests = ["input1", "input2"]
    #     res = lm.loglikelihood_rolling(requests)

    #     # Assert the loglikelihood_rolling response is correct
    #     expected_res = [(-1.2345, True), (-1.2345, True)]
    #     self.assertEqual(res, expected_res)
149
+
150
+
151
+ if __name__ == "__main__":
152
+ unittest.main()
scripts/yans/lm-evaluation-harness/tests/models/test_huggingface.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+ import torch
9
+
10
+ from lm_eval import tasks
11
+ from lm_eval.api.instance import Instance
12
+ from lm_eval.models.huggingface import HFLM
13
+
14
+
15
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
16
+ task_manager = tasks.TaskManager()
17
+
18
+ TEST_STRING = "foo bar"
19
+
20
+
21
class Test_HFLM:
    """Regression tests comparing ``HFLM`` outputs against recorded values.

    Everything in the class body runs when the class is defined: tasks are
    loaded, their requests are built with ``limit=10``, and the
    ``EleutherAI/pythia-70m`` model is instantiated on CPU in float32.
    """

    torch.use_deterministic_algorithms(True)
    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
    # Minor Python version, used only to name the log file written below.
    version_minor = sys.version_info.minor

    # Multiple-choice requests (arc_easy).
    multiple_choice_task = task_list["arc_easy"]  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: list[Instance] = multiple_choice_task.instances

    # Generation requests (gsm8k), capped at 10 generated tokens.
    generate_until_task = task_list["gsm8k"]  # type: ignore
    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
    generate_until_task.set_fewshot_seed(1234)  # fewshot random generator seed
    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    generate_until: list[Instance] = generate_until_task.instances

    # Rolling loglikelihood requests (wikitext).
    rolling_task = task_list["wikitext"]  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
    ROLLING: list[Instance] = rolling_task.instances

    # Recorded loglikelihoods for MULTIPLE_CH (reshaped to rows of 4 below).
    MULTIPLE_CH_RES = [
        -41.902435302734375,
        -42.939308166503906,
        -33.914180755615234,
        -37.07139205932617,
        -22.95258331298828,
        -20.342208862304688,
        -14.818366050720215,
        -27.942853927612305,
        -15.80704116821289,
        -15.936427116394043,
        -13.052018165588379,
        -18.04828453063965,
        -13.345029830932617,
        -13.366025924682617,
        -12.127134323120117,
        -11.872495651245117,
        -47.10598373413086,
        -47.76410675048828,
        -36.4406852722168,
        -50.0289421081543,
        -16.72093963623047,
        -18.535587310791016,
        -26.46993637084961,
        -20.355995178222656,
        -17.757919311523438,
        -21.80595588684082,
        -33.1990852355957,
        -39.28636932373047,
        -14.759679794311523,
        -16.753942489624023,
        -11.486852645874023,
        -15.42177677154541,
        -13.15798282623291,
        -15.887393951416016,
        -15.28614616394043,
        -12.339089393615723,
        -44.59441375732422,
        -55.40888214111328,
        -52.70050811767578,
        -56.25089645385742,
    ]
    # Recorded generations for the gsm8k requests.
    generate_until_RES = [
        " The average of $2.50 each is $",
        " A robe takes 2 bolts of blue fiber and half",
        " $50,000 in repairs.\n\nQuestion",
        " He runs 1 sprint 3 times a week.",
        " They feed each of her chickens three cups of mixed",
        " The price of the glasses is $5, but",
        " The total percentage of students who said they like to",
        " Carla is downloading a 200 GB file. Normally",
        " John drives for 3 hours at a speed of 60",
        " Eliza sells 4 tickets to 5 friends so she",
    ]
    # Recorded rolling loglikelihoods for the wikitext requests.
    ROLLING_RES = [
        -3603.6328125,
        -19779.23974609375,
        -8834.16455078125,
        -27967.591796875,
        -7636.794982910156,
        -9491.93505859375,
        -41043.4248046875,
        -8397.689819335938,
        -45969.47155761719,
        -7158.90625,
    ]
    LM = HFLM(pretrained="EleutherAI/pythia-70m", device="cpu", dtype="float32")

    def test_logliklihood(self) -> None:
        """Loglikelihoods match the recorded values and pick the same choices."""
        expected = self.MULTIPLE_CH_RES
        actual = [pair[0] for pair in self.LM.loglikelihood(self.MULTIPLE_CH)]

        # log samples to CI
        log_dir = Path("test_logs")
        log_dir.mkdir(parents=True, exist_ok=True)
        log_file = (log_dir / f"outputs_log_{self.version_minor}.txt").resolve()
        log_file.write_text("\n".join(str(x) for x in actual), encoding="utf-8")

        assert np.allclose(actual, expected, atol=1e-2)

        # check indices for Multiple Choice
        expected_choice = np.array(expected).reshape(-1, 4).argmax(axis=1)
        actual_choice = np.array(actual).reshape(-1, 4).argmax(axis=1)
        assert (expected_choice == actual_choice).all()

    def test_generate_until(self) -> None:
        """Generated continuations equal the recorded strings exactly."""
        outputs = self.LM.generate_until(self.generate_until)
        assert outputs == self.generate_until_RES

    def test_logliklihood_rolling(self) -> None:
        """Rolling loglikelihoods match the recorded values within 1e-1."""
        scores = self.LM.loglikelihood_rolling(self.ROLLING)
        assert np.allclose(scores, self.ROLLING_RES, atol=1e-1)

    def test_toc_encode(self) -> None:
        """``tok_encode`` maps the test string to the recorded token ids."""
        token_ids = self.LM.tok_encode(TEST_STRING)
        assert token_ids == [12110, 2534]

    def test_toc_decode(self) -> None:
        """``tok_decode`` maps the recorded token ids back to the test string."""
        text = self.LM.tok_decode([12110, 2534])
        assert text == TEST_STRING

    def test_batch_encode(self) -> None:
        """``tok_batch_encode`` returns the recorded ids for a two-string batch."""
        encoded = self.LM.tok_batch_encode([TEST_STRING, "bar foo"])
        assert encoded[0].tolist() == [[12110, 2534], [2009, 17374]]

    def test_model_generate(self) -> None:
        """``_model_generate`` output decodes to the recorded string."""
        context = self.LM.tok_batch_encode([TEST_STRING])[0]
        generated = self.LM._model_generate(context, max_length=10, stop=["\n\n"])
        decoded = self.LM.tok_decode(generated[0])
        assert decoded == "foo bar\n<bazhang>!info bar"
scripts/yans/lm-evaluation-harness/tests/models/test_neuron_optimum.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ import torch
3
+
4
+ from lm_eval.models.neuron_optimum import wrap_constant_batch_size
5
+
6
+
7
def test_wrap_constant_batch_size():
    """Check ``wrap_constant_batch_size`` for batches up to and beyond the limit.

    For every input batch of 1..8 rows, the wrapped method must see exactly
    ``batch_size`` rows internally and the caller must get back a tensor equal
    to the input. An input larger than ``batch_size`` must raise
    ``ValueError``.
    """

    class Tester:
        """Minimal host object exposing the ``batch_size`` attribute the test reads."""

        def __init__(self, batch_size):
            self.batch_size = batch_size

        @wrap_constant_batch_size
        def test_constant_batch_size(self, inputs):
            # Inside the wrapper the batch must already have the constant size.
            assert len(inputs) == self.batch_size
            return inputs

    batch_size_test = 8
    for rows in range(1, batch_size_test + 1):
        tensor = torch.ones([rows, 2, 2])
        out = Tester(batch_size=batch_size_test).test_constant_batch_size(tensor)
        # assert_close replaces torch.testing.assert_allclose, which has been
        # deprecated since PyTorch 1.12.
        torch.testing.assert_close(out, tensor)

    with pytest.raises(ValueError):
        Tester(batch_size=batch_size_test).test_constant_batch_size(
            torch.ones([batch_size_test + 1, 2, 2])
        )
scripts/yans/lm-evaluation-harness/tests/models/test_openvino.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import tempfile
3
+ from pathlib import Path
4
+
5
+ import pytest
6
+ from optimum.intel import OVModelForCausalLM
7
+ from transformers import AutoTokenizer
8
+
9
+ from lm_eval import evaluator
10
+ from lm_eval.api.registry import get_model
11
+
12
+
13
+ SUPPORTED_ARCHITECTURES_TASKS = {
14
+ "facebook/opt-125m": "lambada_openai",
15
+ "hf-internal-testing/tiny-random-gpt2": "wikitext",
16
+ }
17
+
18
+
19
@pytest.mark.parametrize("model_id,task", SUPPORTED_ARCHITECTURES_TASKS.items())
def test_evaluator(model_id, task):
    """Run ``simple_evaluate`` on an OpenVINO-exported model with stubbed scoring.

    The model and tokenizer are exported into a temporary directory and loaded
    through the ``openvino`` model class. ``loglikelihood`` and
    ``loglikelihood_rolling`` are then replaced by stubs that validate the
    request arguments and return seeded pseudo-random scores.
    """
    with tempfile.TemporaryDirectory() as export_dir:
        exported = OVModelForCausalLM.from_pretrained(
            model_id, export=True, use_cache=True
        )
        exported.save_pretrained(export_dir)
        AutoTokenizer.from_pretrained(model_id).save_pretrained(export_dir)

        lm = get_model("openvino").create_from_arg_string(
            f"pretrained={export_dir}",
            {
                "batch_size": 1,
                "device": "cpu",
            },
        )

        def ll_fn(reqs):
            """Validate (context, continuation) pairs and return seeded scores."""
            pairs = [req.args for req in reqs]
            for ctx, cont in pairs:
                if len(ctx) == 0:
                    continue
                # space convention
                assert ctx[-1] != " "
                assert cont[0] == " " or ctx[-1] == "\n"

            random.seed(42)
            return [(-random.random(), False) for _ in reqs]

        def ll_perp_fn(reqs):
            """Validate single-string requests and return seeded scores."""
            singles = [req.args for req in reqs]
            for (string,) in singles:
                assert isinstance(string, str)

            random.seed(42)
            return [-random.random() for _ in reqs]

        lm.loglikelihood = ll_fn
        lm.loglikelihood_rolling = ll_perp_fn

        limit = 10
        evaluator.simple_evaluate(
            model=lm,
            tasks=[task],
            num_fewshot=0,
            limit=limit,
            bootstrap_iters=10,
        )
75
+
76
+
77
def test_ov_config():
    """Test that if specified, a custom OpenVINO config is loaded correctly"""
    model_id = "hf-internal-testing/tiny-random-gpt2"
    property_name = "DYNAMIC_QUANTIZATION_GROUP_SIZE"
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Write the custom config next to nothing else; it is passed by path.
        config_path = Path(tmpdirname) / "ov_config.json"
        config_path.write_text(
            '{"DYNAMIC_QUANTIZATION_GROUP_SIZE" : "32"}', encoding="utf-8"
        )

        lm = get_model("openvino").create_from_arg_string(
            f"pretrained={model_id},ov_config={config_path}"
        )

        compiled_model = lm.model.request.get_compiled_model()
        assert compiled_model.get_property(property_name) == 32
+ )
scripts/yans/lm-evaluation-harness/tests/models/test_vllm.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ import pytest
4
+
5
+ from lm_eval import tasks
6
+ from lm_eval.api.instance import Instance
7
+
8
+
9
+ task_manager = tasks.TaskManager()
10
+
11
+
12
@pytest.mark.skip(reason="requires CUDA")
class Test_VLLM:
    """Smoke tests for the vLLM backend; the whole class is marked as skipped.

    The class body runs at definition time: it requires ``vllm`` to be
    importable, tries to build the model, loads the tasks and builds their
    requests with ``limit=10``.
    """

    vllm = pytest.importorskip("vllm")
    try:
        from lm_eval.models.vllm_causallms import VLLM

        LM = VLLM(pretrained="EleutherAI/pythia-70m")
    except ModuleNotFoundError:
        # Leaves ``LM`` undefined when the backend module cannot be imported.
        pass
    # torch.use_deterministic_algorithms(True)
    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])

    # Multiple-choice requests (arc_easy).
    multiple_choice_task = task_list["arc_easy"]  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: List[Instance] = multiple_choice_task.instances

    # Generation requests (gsm8k), capped at 10 generated tokens.
    generate_until_task = task_list["gsm8k"]  # type: ignore
    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    generate_until: List[Instance] = generate_until_task.instances

    # Rolling loglikelihood requests (wikitext).
    rolling_task = task_list["wikitext"]  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
    ROLLING: List[Instance] = rolling_task.instances

    # TODO: make proper tests
    def test_logliklihood(self) -> None:
        """One result per request, each starting with a float."""
        scored = self.LM.loglikelihood(self.MULTIPLE_CH)
        assert len(scored) == len(self.MULTIPLE_CH)
        for entry in scored:
            assert isinstance(entry[0], float)

    def test_generate_until(self) -> None:
        """One string result per generation request."""
        outputs = self.LM.generate_until(self.generate_until)
        assert len(outputs) == len(self.generate_until)
        for text in outputs:
            assert isinstance(text, str)

    def test_logliklihood_rolling(self) -> None:
        """Every rolling loglikelihood result is a float."""
        scores = self.LM.loglikelihood_rolling(self.ROLLING)
        for score in scores:
            assert isinstance(score, float)
scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v2.0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 8ebbbc510644ede7bf53496c381e276d5a1eec14828870e8b7e611f231e6d5f6
scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_3ds-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"arithmetic_3ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_3ds": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_4ds-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ d915830b8621e66331383bb2ae4c60acebf008e2f94741092ef4c33ea5441037
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 7fab9f02e71a224ae7931aa77f8a9a61d887a7480756adc965d4746e97fb04a5
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"blimp_determiner_noun_agreement_with_adj_irregular_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adj_irregular_2": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"blimp_determiner_noun_agreement_with_adjective_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adjective_1": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_2-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 0523771a217759f0b22b89807694ee7f6381ce98a584b1fd070ba96194a3273b
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_inchoative-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 3ff73629fb4473986a0e8ae2fcb7c40e88292189ab0d8755d20836c5aa5a2f99
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_past_participle_verbs-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 63ec733873f94ace71cb34112d1c3cd5bb768c26b975fb90acc9b8ba3f4e938e
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 9534751f83a86b6cbe1fb12fb9feb827b0b7836a663108928b4ecc1d70b08871
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_c_command-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"blimp_principle_A_c_command": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_c_command": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_reconstruction-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"blimp_principle_A_reconstruction": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_reconstruction": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"blimp_wh_island": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_island": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ de5aa6f77a2e0fd050b9c272f10c4d5d5581e4f75ffa60926f79e60ae1738960
scripts/yans/lm-evaluation-harness/tests/testdata/cola-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ e8635578ed8ee70b707a666d35e468b9321db24470f80c92080651e2bfa01751
scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v1-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"coqa": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"coqa": 1}}
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_disability-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"crows_pairs_french_disability": {"likelihood_difference": 0.31387939561315326, "likelihood_difference_stderr": 0.027598132299657168, "pct_stereotype": 0.36363636363636365, "pct_stereotype_stderr": 0.05966637484671758}}, "versions": {"crows_pairs_french_disability": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_gender-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 010b8404655911c86555616da23afffce9dc3981e1acbbfdb022d9c474430209
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 146eb60c8796fe3f25307a6776337f0b077b58ce02edec64c99df4b906c19b9f
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"crows_pairs_french_nationality": {"likelihood_difference": 0.33534193269044926, "likelihood_difference_stderr": 0.01429836309463257, "pct_stereotype": 0.4743083003952569, "pct_stereotype_stderr": 0.031455431847992904}}, "versions": {"crows_pairs_french_nationality": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/drop-v0-greedy_until ADDED
@@ -0,0 +1 @@
 
 
1
+ ca566c630d8ac853d5785d4b5c40a5137172c34b48af3350e1f79e6d548b36ba
scripts/yans/lm-evaluation-harness/tests/testdata/ethics_justice-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"ethics_justice": {"acc": 0.49556213017751477, "acc_stderr": 0.009616784279885177, "em": 0.057692307692307696}}, "versions": {"ethics_justice": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/gguf_test_52ea409606de8755e03cf7c79f824101a4ce64bb6e6d3df556b8a4e7a5d92418.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4f122bfaa24901cff1ee686da0cf49ade7b6877c31a3daeb32c8cf2e328a77e
3
+ size 153
scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_cfd11f555a5a63b6dfa114a55a932e51b724cdd44d4842586b9ce37260bf7aaa.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d531b0854314516cad7d56c7e28a694bf23072429147b235e9c6534492867bb2
3
+ size 2984
scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 09da45119b12a0144e3081f8fb790c2a22af7b9c3aac42f54423d348a711fbf5
scripts/yans/lm-evaluation-harness/tests/testdata/headqa_en-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"headqa_en": {"acc": 0.23559445660102116, "acc_norm": 0.2447118891320204, "acc_norm_stderr": 0.008211629406841468, "acc_stderr": 0.008105688874297972}}, "versions": {"headqa_en": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-astronomy-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ bed1e47127cc2893c6aef63b9a0909cca31aa351a703da2a166b01cae03c3311
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-clinical_knowledge-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"hendrycksTest-clinical_knowledge": {"acc": 0.23773584905660378, "acc_norm": 0.27169811320754716, "acc_norm_stderr": 0.027377706624670713, "acc_stderr": 0.02619980880756191}}, "versions": {"hendrycksTest-clinical_knowledge": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_chemistry-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ f4f338e45415c4b5ee7f1d249155bcd910c8401bd1436760a5ec61cb6bb211b6
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_computer_science-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 870d5a6300c527077aaf6baa3e750e75fa840b41657cf82549f39b768b14862d
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_european_history-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ d8070e113be9d420fef5578cb69c70df4ea5118f9b18553023fd9efd5ff0b7f4
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_mathematics-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"hendrycksTest-high_school_mathematics": {"acc": 0.22592592592592592, "acc_norm": 0.24814814814814815, "acc_norm_stderr": 0.0263357394040558, "acc_stderr": 0.025497532639609553}}, "versions": {"hendrycksTest-high_school_mathematics": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_statistics-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 33d1d6eaaa2c3a944bf49d3f220a4efc328d7c3b3465b7cec40ae36d8984b75f
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 8c65c1a28330dd001d395ac11f1bb80c3b33f5935f503e74067aef6e9e1d9d9b
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-jurisprudence-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ cac440189f1ec778e82f4975d88b74689553ecc5116aaa7f76587a50c1a610e0
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-logical_fallacies-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"hendrycksTest-logical_fallacies": {"acc": 0.20245398773006135, "acc_norm": 0.2147239263803681, "acc_norm_stderr": 0.03226219377286774, "acc_stderr": 0.03157065078911902}}, "versions": {"hendrycksTest-logical_fallacies": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-medical_genetics-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"hendrycksTest-medical_genetics": {"acc": 0.27, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684, "acc_stderr": 0.0446196043338474}}, "versions": {"hendrycksTest-medical_genetics": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-philosophy-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ a419204da36c2b7a70fa8909a3a804260cc3283c7e07917534dfb76216c77f46
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-world_religions-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 97a0f68ba30ea3a6ef1db1a2925c964b09ecc54455a0a930da083e52677815bd
scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-en-ar-v0-greedy_until ADDED
@@ -0,0 +1 @@
 
 
1
+ b20adbcd2c6d135e28600b427113532c5df624cb3a90e8c5e48715c09a3a38fa
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_fr-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 5d16f4a0c51dc6d7b6df2ebeba2bbfa51e700b843779b559b3d90183d7b02a11
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_it-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"lambada_mt_it": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_it": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 6829e6a8aa5922e6c92dd31403cc060f242dc0ede4a775e085a70da095ab2e20
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_cloze-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"lambada_openai_cloze": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_openai_cloze": 0}}