clemsail committed on
Commit
afb5696
·
verified ·
1 Parent(s): 5d4efec

chore: upload lm-eval-harness results

Browse files
evals/results_2026-04-15T13-55-33.588896.json ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "gsm8k": {
4
+ "alias": "gsm8k",
5
+ "exact_match,strict-match": 0.844,
6
+ "exact_match_stderr,strict-match": 0.016243636028391094,
7
+ "exact_match,flexible-extract": 0.892,
8
+ "exact_match_stderr,flexible-extract": 0.013894535480989173
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "gsm8k": []
13
+ },
14
+ "configs": {
15
+ "gsm8k": {
16
+ "task": "gsm8k",
17
+ "tag": [
18
+ "math_word_problems"
19
+ ],
20
+ "dataset_path": "openai/gsm8k",
21
+ "dataset_name": "main",
22
+ "training_split": "train",
23
+ "test_split": "test",
24
+ "fewshot_split": "train",
25
+ "doc_to_text": "Question: {{question}}\nAnswer:",
26
+ "doc_to_target": "{{answer}}",
27
+ "unsafe_code": false,
28
+ "description": "",
29
+ "target_delimiter": " ",
30
+ "fewshot_delimiter": "\n\n",
31
+ "fewshot_config": {
32
+ "sampler": "default",
33
+ "split": "train",
34
+ "process_docs": null,
35
+ "fewshot_indices": null,
36
+ "samples": null,
37
+ "doc_to_text": "Question: {{question}}\nAnswer:",
38
+ "doc_to_choice": null,
39
+ "doc_to_target": "{{answer}}",
40
+ "gen_prefix": null,
41
+ "fewshot_delimiter": "\n\n",
42
+ "target_delimiter": " "
43
+ },
44
+ "num_fewshot": 5,
45
+ "metric_list": [
46
+ {
47
+ "metric": "exact_match",
48
+ "aggregation": "mean",
49
+ "higher_is_better": true,
50
+ "ignore_case": true,
51
+ "ignore_punctuation": false,
52
+ "regexes_to_ignore": [
53
+ ",",
54
+ "\\$",
55
+ "(?s).*#### ",
56
+ "\\.$"
57
+ ]
58
+ }
59
+ ],
60
+ "output_type": "generate_until",
61
+ "generation_kwargs": {
62
+ "until": [
63
+ "Question:",
64
+ "</s>",
65
+ "<|im_end|>"
66
+ ],
67
+ "do_sample": false,
68
+ "temperature": 0.0
69
+ },
70
+ "repeats": 1,
71
+ "filter_list": [
72
+ {
73
+ "name": "strict-match",
74
+ "filter": [
75
+ {
76
+ "function": "regex",
77
+ "regex_pattern": "#### (\\-?[0-9\\.\\,]+)"
78
+ },
79
+ {
80
+ "function": "take_first"
81
+ }
82
+ ]
83
+ },
84
+ {
85
+ "name": "flexible-extract",
86
+ "filter": [
87
+ {
88
+ "function": "regex",
89
+ "group_select": -1,
90
+ "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
91
+ },
92
+ {
93
+ "function": "take_first"
94
+ }
95
+ ]
96
+ }
97
+ ],
98
+ "should_decontaminate": false,
99
+ "metadata": {
100
+ "version": 3.0,
101
+ "base_url": "http://localhost:8000/v1/chat/completions",
102
+ "model": "devstral",
103
+ "num_concurrent": 1
104
+ }
105
+ }
106
+ },
107
+ "versions": {
108
+ "gsm8k": 3.0
109
+ },
110
+ "n-shot": {
111
+ "gsm8k": 5
112
+ },
113
+ "higher_is_better": {
114
+ "gsm8k": {
115
+ "exact_match": true
116
+ }
117
+ },
118
+ "n-samples": {
119
+ "gsm8k": {
120
+ "original": 1319,
121
+ "effective": 500
122
+ }
123
+ },
124
+ "config": {
125
+ "model": "local-chat-completions",
126
+ "model_args": {
127
+ "base_url": "http://localhost:8000/v1/chat/completions",
128
+ "model": "devstral",
129
+ "num_concurrent": 1
130
+ },
131
+ "batch_size": "1",
132
+ "batch_sizes": [],
133
+ "device": "cuda:0",
134
+ "use_cache": null,
135
+ "limit": 500.0,
136
+ "bootstrap_iters": 100000,
137
+ "gen_kwargs": {},
138
+ "random_seed": 0,
139
+ "numpy_seed": 1234,
140
+ "torch_seed": 1234,
141
+ "fewshot_seed": 1234
142
+ },
143
+ "git_hash": null,
144
+ "date": 1776252935.2190523,
145
+ "pretty_env_info": "N/A (torch not installed)",
146
+ "transformers_version": "N/A",
147
+ "lm_eval_version": "0.4.11",
148
+ "upper_git_hash": null,
149
+ "task_hashes": {
150
+ "gsm8k": "d7e2c3373f72b8e34c540589c2c10df09e2f7b703350791f9e62af69c7f59ab0"
151
+ },
152
+ "model_source": "local-chat-completions",
153
+ "model_name": "devstral",
154
+ "model_name_sanitized": "devstral",
155
+ "system_instruction": null,
156
+ "system_instruction_sha": null,
157
+ "fewshot_as_multiturn": true,
158
+ "chat_template": "",
159
+ "chat_template_sha": null,
160
+ "total_evaluation_time_seconds": "1199.729275439866"
161
+ }