jaothan commited on
Commit
aa73876
·
verified ·
1 Parent(s): b2291ea

Upload 4 files

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. README.md +13 -11
  3. app.py +430 -0
  4. common.py +652 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
README.md CHANGED
@@ -1,11 +1,13 @@
1
- ---
2
- title: Bench
3
- emoji: 📉
4
- colorFrom: indigo
5
- colorTo: green
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
1
+ ---
2
+ title: MT Bench
3
+ emoji: 📊
4
+ colorFrom: yellow
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 3.40.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: other
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Usage:
3
+ python3 qa_browser.py --share
4
+ """
5
+
6
+ import argparse
7
+ from collections import defaultdict
8
+ import re
9
+
10
+ import gradio as gr
11
+
12
+ from common import (
13
+ load_questions,
14
+ load_model_answers,
15
+ load_single_model_judgments,
16
+ load_pairwise_model_judgments,
17
+ resolve_single_judgment_dict,
18
+ resolve_pairwise_judgment_dict,
19
+ get_single_judge_explanation,
20
+ get_pairwise_judge_explanation,
21
+ )
22
+
23
+
24
# Global state populated by the __main__ block at startup and read by the
# Gradio callbacks below.
questions = []  # list of question dicts loaded from question.jsonl
model_answers = {}  # model name -> {question_id -> answer dict}

# Single-answer-grading judgments (normal and math alias one dict here).
model_judgments_normal_single = {}
model_judgments_math_single = {}

# Pairwise-comparison judgments (normal and math alias one dict here).
model_judgments_normal_pairwise = {}
model_judgments_math_pairwise = {}

# Question preview string -> question dict; category -> list of previews.
question_selector_map = {}
category_selector_map = defaultdict(list)
35
+
36
+
37
def display_question(category_selector, request: gr.Request):
    """Repopulate the question dropdown with the selected category's questions."""
    options = category_selector_map[category_selector]
    return gr.Dropdown.update(value=options[0], choices=options)
43
+
44
+
45
def display_pairwise_answer(
    question_selector, model_selector1, model_selector2, request: gr.Request
):
    """Render a two-model, two-turn comparison plus the judge's verdicts.

    Returns the list of Markdown strings expected by the pairwise tab:
    seven chat/reference cells followed by the two per-turn explanations.
    """
    question = question_selector_map[question_selector]
    qid = question["question_id"]

    answer_a = model_answers[model_selector1][qid]
    answer_b = model_answers[model_selector2][qid]

    chat_mds = pairwise_to_gradio_chat_mds(question, answer_a, answer_b)
    gamekey = (qid, model_selector1, model_selector2)

    # Build the first- and second-turn judgment explanations in order.
    explanations = []
    for turn_label, multi_turn in (("first", False), ("second", True)):
        judgment_dict = resolve_pairwise_judgment_dict(
            question,
            model_judgments_normal_pairwise,
            model_judgments_math_pairwise,
            multi_turn=multi_turn,
        )
        explanations.append(
            f"##### Model Judgment ({turn_label} turn)\n"
            + get_pairwise_judge_explanation(gamekey, judgment_dict)
        )

    return chat_mds + explanations
82
+
83
+
84
def display_single_answer(question_selector, model_selector1, request: gr.Request):
    """Render one model's answers for a question plus per-turn judge scores."""
    question = question_selector_map[question_selector]
    qid = question["question_id"]

    answer = model_answers[model_selector1][qid]
    chat_mds = single_to_gradio_chat_mds(question, answer)
    gamekey = (qid, model_selector1)

    # Build the first- and second-turn judgment explanations in order.
    explanations = []
    for turn_label, multi_turn in (("first", False), ("second", True)):
        judgment_dict = resolve_single_judgment_dict(
            question,
            model_judgments_normal_single,
            model_judgments_math_single,
            multi_turn=multi_turn,
        )
        explanations.append(
            f"##### Model Judgment ({turn_label} turn)\n"
            + get_single_judge_explanation(gamekey, judgment_dict)
        )

    return chat_mds + explanations
111
+
112
+
113
# Patterns matching the blank line before a numbered or bulleted list item.
# Raw strings fix the deprecated invalid escape sequences ("\d", "\g") that
# the old non-raw literals produced (SyntaxWarning on modern Python).
newline_pattern1 = re.compile(r"\n\n(\d+\. )")
newline_pattern2 = re.compile(r"\n\n(- )")


def post_process_answer(x):
    """Fix Markdown rendering problems.

    Replaces unicode bullets with "- " and collapses the blank line that
    precedes numbered/bulleted list items so lists render compactly.
    """
    x = x.replace("\u2022", "- ")
    x = newline_pattern1.sub(r"\n\g<1>", x)
    x = newline_pattern2.sub(r"\n\g<1>", x)
    return x
123
+
124
+
125
def pairwise_to_gradio_chat_mds(question, ans_a, ans_b, turn=None):
    """Convert a question and two model answers into 7 Markdown strings.

    Layout: [user q1, answer A1, answer B1, user q2, answer A2, answer B2,
    reference]. If `turn` is given, only turns up to that index are filled.
    (Removes the unused `ref_md` local of the original.)
    """
    end = len(question["turns"]) if turn is None else turn + 1

    mds = ["", "", "", "", "", "", ""]
    for i in range(end):
        base = i * 3
        if i == 0:
            mds[base + 0] = "##### User\n" + question["turns"][i]
        else:
            mds[base + 0] = "##### User's follow-up question \n" + question["turns"][i]
        mds[base + 1] = "##### Assistant A\n" + post_process_answer(
            ans_a["choices"][0]["turns"][i].strip()
        )
        mds[base + 2] = "##### Assistant B\n" + post_process_answer(
            ans_b["choices"][0]["turns"][i].strip()
        )

    # Reference solutions are only present for some categories.
    ref = question.get("reference", ["", ""])
    if turn is None:
        if ref[0] != "" or ref[1] != "":
            mds[6] = f"##### Reference Solution\nQ1. {ref[0]}\nQ2. {ref[1]}"
    else:
        x = ref[turn] if turn < len(ref) else ""
        mds[6] = f"##### Reference Solution\n{x}" if x else ""
    return mds
155
+
156
+
157
def single_to_gradio_chat_mds(question, ans, turn=None):
    """Convert a question and one model answer into 5 Markdown strings.

    Layout: [user q1, answer 1, user q2, answer 2, reference]. If `turn`
    is given, only turns up to that index are filled.
    (Removes the unused `ref_md` local of the original.)
    """
    end = len(question["turns"]) if turn is None else turn + 1

    mds = ["", "", "", "", ""]
    for i in range(end):
        base = i * 2
        if i == 0:
            mds[base + 0] = "##### User\n" + question["turns"][i]
        else:
            mds[base + 0] = "##### User's follow-up question \n" + question["turns"][i]
        mds[base + 1] = "##### Assistant A\n" + post_process_answer(
            ans["choices"][0]["turns"][i].strip()
        )

    # Reference solutions are only present for some categories.
    ref = question.get("reference", ["", ""])
    if turn is None:
        if ref[0] != "" or ref[1] != "":
            mds[4] = f"##### Reference Solution\nQ1. {ref[0]}\nQ2. {ref[1]}"
    else:
        x = ref[turn] if turn < len(ref) else ""
        mds[4] = f"##### Reference Solution\n{x}" if x else ""
    return mds
184
+
185
+
186
def build_question_selector_map():
    """Populate the preview->question and category->previews lookup maps."""
    global question_selector_map, category_selector_map

    for question in questions:
        # Preview = "<id>: <first 128 chars of turn 1>..."
        preview = f"{question['question_id']}: {question['turns'][0][:128]}..."
        question_selector_map[preview] = question
        category_selector_map[question["category"]].append(preview)
194
+
195
+
196
def sort_models(models):
    """Return the models sorted by name, with the Llama-2 chat models pinned
    to the front via surrogate sort keys that precede typical model names."""
    pinned = {
        "Llama-2-70b-chat": "aaaa",
        "Llama-2-13b-chat": "aaab",
        "Llama-2-7b-chat": "aaac",
    }
    return sorted(models, key=lambda name: pinned.get(name, name))
206
+
207
+
208
def build_pairwise_browser_tab():
    """Build the Gradio widgets for the pairwise-comparison tab.

    Must be called inside a gr.Blocks() context. Returns a 1-tuple holding
    the category dropdown so the caller can initialize it on demo load.
    """
    global question_selector_map, category_selector_map

    models = sort_models(list(model_answers.keys()))
    num_sides = 2
    num_turns = 2
    side_names = ["A", "B"]

    question_selector_choices = list(question_selector_map.keys())
    category_selector_choices = list(category_selector_map.keys())

    # Selectors
    with gr.Row():
        with gr.Column(scale=1, min_width=200):
            category_selector = gr.Dropdown(
                choices=category_selector_choices, label="Category", container=False
            )
        with gr.Column(scale=100):
            question_selector = gr.Dropdown(
                choices=question_selector_choices, label="Question", container=False
            )

    # One model dropdown per side; side B defaults to gpt-3.5-turbo.
    model_selectors = [None] * num_sides
    with gr.Row():
        for i in range(num_sides):
            with gr.Column():
                if i == 0:
                    value = models[0]
                else:
                    value = "gpt-3.5-turbo"
                model_selectors[i] = gr.Dropdown(
                    choices=models,
                    value=value,
                    label=f"Model {side_names[i]}",
                    container=False,
                )

    # Conversation cells; chat_mds order must match the output order of
    # pairwise_to_gradio_chat_mds (user, A, B per turn, then reference).
    chat_mds = []
    for i in range(num_turns):
        chat_mds.append(gr.Markdown(elem_id=f"user_question_{i+1}"))
        with gr.Row():
            for j in range(num_sides):
                with gr.Column(scale=100):
                    chat_mds.append(gr.Markdown())

                if j == 0:
                    # Thin spacer column between the two answer columns.
                    with gr.Column(scale=1, min_width=8):
                        gr.Markdown()
    reference = gr.Markdown(elem_id=f"reference")
    chat_mds.append(reference)

    model_explanation = gr.Markdown(elem_id="model_explanation")
    model_explanation2 = gr.Markdown(elem_id="model_explanation")

    # Callbacks: any selector change re-renders the whole comparison.
    category_selector.change(display_question, [category_selector], [question_selector])
    question_selector.change(
        display_pairwise_answer,
        [question_selector] + model_selectors,
        chat_mds + [model_explanation] + [model_explanation2],
    )

    for i in range(num_sides):
        model_selectors[i].change(
            display_pairwise_answer,
            [question_selector] + model_selectors,
            chat_mds + [model_explanation] + [model_explanation2],
        )

    return (category_selector,)
279
+
280
+
281
def build_single_answer_browser_tab():
    """Build the Gradio widgets for the single-answer-grading tab.

    Must be called inside a gr.Blocks() context. Returns a 1-tuple holding
    the category dropdown so the caller can initialize it on demo load.
    """
    global question_selector_map, category_selector_map

    models = sort_models(list(model_answers.keys()))
    num_sides = 1
    num_turns = 2
    side_names = ["A"]

    question_selector_choices = list(question_selector_map.keys())
    category_selector_choices = list(category_selector_map.keys())

    # Selectors
    with gr.Row():
        with gr.Column(scale=1, min_width=200):
            category_selector = gr.Dropdown(
                choices=category_selector_choices, label="Category", container=False
            )
        with gr.Column(scale=100):
            question_selector = gr.Dropdown(
                choices=question_selector_choices, label="Question", container=False
            )

    # Single model dropdown, defaulting to the first model (or "" if none).
    model_selectors = [None] * num_sides
    with gr.Row():
        for i in range(num_sides):
            with gr.Column():
                model_selectors[i] = gr.Dropdown(
                    choices=models,
                    value=models[i] if len(models) > i else "",
                    label=f"Model {side_names[i]}",
                    container=False,
                )

    # Conversation cells; chat_mds order must match the output order of
    # single_to_gradio_chat_mds (user, answer per turn, then reference).
    chat_mds = []
    for i in range(num_turns):
        chat_mds.append(gr.Markdown(elem_id=f"user_question_{i+1}"))
        with gr.Row():
            for j in range(num_sides):
                with gr.Column(scale=100):
                    chat_mds.append(gr.Markdown())

                if j == 0:
                    # Thin spacer column.
                    with gr.Column(scale=1, min_width=8):
                        gr.Markdown()

    reference = gr.Markdown(elem_id=f"reference")
    chat_mds.append(reference)

    model_explanation = gr.Markdown(elem_id="model_explanation")
    model_explanation2 = gr.Markdown(elem_id="model_explanation")

    # Callbacks: any selector change re-renders the whole view.
    category_selector.change(display_question, [category_selector], [question_selector])
    question_selector.change(
        display_single_answer,
        [question_selector] + model_selectors,
        chat_mds + [model_explanation] + [model_explanation2],
    )

    for i in range(num_sides):
        model_selectors[i].change(
            display_single_answer,
            [question_selector] + model_selectors,
            chat_mds + [model_explanation] + [model_explanation2],
        )

    return (category_selector,)
349
+
350
+
351
# CSS giving each section a distinct background color; the ids are assigned
# via elem_id= in the tab-builder functions above.
block_css = """
#user_question_1 {
    background-color: #DEEBF7;
}
#user_question_2 {
    background-color: #E2F0D9;
}
#reference {
    background-color: #FFF2CC;
}
#model_explanation {
    background-color: #FBE5D6;
}
"""
365
+
366
+
367
def load_demo():
    """Select the first category in both tabs when the demo loads."""
    first_category = list(category_selector_map.keys())[0]
    dropdown_update = gr.Dropdown.update(value=first_category)
    return dropdown_update, dropdown_update
370
+
371
+
372
def build_demo():
    """Assemble the full Gradio app: header, both browser tabs, load hook."""
    build_question_selector_map()

    with gr.Blocks(
        title="MT-Bench Browser",
        theme=gr.themes.Base(text_size=gr.themes.sizes.text_lg),
        css=block_css,
    ) as demo:
        gr.Markdown(
            """
# MT-Bench Browser
| [Paper](https://arxiv.org/abs/2306.05685) | [Code](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge) | [Leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) |
"""
        )
        with gr.Tab("Single Answer Grading"):
            (category_selector,) = build_single_answer_browser_tab()
        with gr.Tab("Pairwise Comparison"):
            (category_selector2,) = build_pairwise_browser_tab()
        # Initialize both category dropdowns once the UI has loaded.
        demo.load(load_demo, [], [category_selector, category_selector2])

    return demo
393
+
394
+
395
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="0.0.0.0")
    parser.add_argument("--port", type=int)
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--bench-name", type=str, default="mt_bench")
    args = parser.parse_args()
    print(args)

    # All inputs live under data/<bench-name>/ relative to the working dir.
    question_file = f"data/{args.bench_name}/question.jsonl"
    answer_dir = f"data/{args.bench_name}/model_answer"
    pairwise_model_judgment_file = (
        f"data/{args.bench_name}/model_judgment/gpt-4_pair.jsonl"
    )
    single_model_judgment_file = (
        f"data/{args.bench_name}/model_judgment/gpt-4_single.jsonl"
    )

    # Load questions
    questions = load_questions(question_file, None, None)

    # Load answers
    model_answers = load_model_answers(answer_dir)

    # Load model judgments. The "normal" and "math" dicts intentionally
    # alias the same object: one file holds judgments for both prompt kinds.
    model_judgments_normal_single = (
        model_judgments_math_single
    ) = load_single_model_judgments(single_model_judgment_file)
    model_judgments_normal_pairwise = (
        model_judgments_math_pairwise
    ) = load_pairwise_model_judgments(pairwise_model_judgment_file)

    demo = build_demo()
    demo.launch(
        server_name=args.host, server_port=args.port, share=args.share, max_threads=200
    )
common.py ADDED
@@ -0,0 +1,652 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Common data structures and utilities.
3
+ """
4
+
5
+ import ast
6
+ import dataclasses
7
+ import glob
8
+ import json
9
+ import os
10
+ import re
11
+ import time
12
+ from typing import Optional
13
+
14
# API setting constants
API_MAX_RETRY = 16
API_RETRY_SLEEP = 10
API_ERROR_OUTPUT = "$ERROR$"

# Score difference at or below which a pairwise match is called a tie.
TIE_DELTA = 0.1

# Categories that need reference answers
NEED_REF_CATS = ["math", "reasoning", "coding"]

# Extract scores from judgments.
# Raw strings fix the deprecated invalid escape sequences ("\[", "\d")
# that the old non-raw literals produced (SyntaxWarning on modern Python).
two_score_pattern = re.compile(r"\[\[(\d+\.?\d*),\s?(\d+\.?\d*)\]\]")
two_score_pattern_backup = re.compile(r"\[(\d+\.?\d*),\s?(\d+\.?\d*)\]")
one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]")
one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]")

# Sampling temperature configs for each question category.
temperature_config = {
    "writing": 0.7,
    "roleplay": 0.7,
    "extraction": 0.0,
    "math": 0.0,
    "coding": 0.0,
    "reasoning": 0.0,
    "stem": 0.1,
    "humanities": 0.1,
}

# Swaps model_1 <-> model_2 when a game key is normalized.
reverse_model_map = {
    "model_1": "model_2",
    "model_2": "model_1",
}
46
+
47
+
48
@dataclasses.dataclass
class Judge:
    """Configuration of one judge: the judging model plus its prompt."""

    model_name: str  # e.g. "gpt-4"
    prompt_template: dict  # one entry as loaded by load_judge_prompts()
    ref_based: bool = False  # whether the prompt expects reference answers
    multi_turn: bool = False  # whether the prompt covers both turns
54
+
55
+
56
@dataclasses.dataclass
class MatchSingle:
    """A single-answer grading match: one model's answer judged in isolation."""

    question: dict
    model: str
    answer: dict
    judge: Judge
    ref_answer: Optional[dict] = None  # needed for reference-based categories
    multi_turn: bool = False
64
+
65
+
66
@dataclasses.dataclass
class MatchPair:
    """A pairwise match: two models' answers to one question, one judge."""

    question: dict
    model_1: str
    model_2: str
    answer_1: dict
    answer_2: dict
    judge: Judge
    ref_answer: Optional[dict] = None  # needed for reference-based categories
    multi_turn: bool = False
76
+
77
+
78
def load_questions(question_file: str, begin: Optional[int], end: Optional[int]):
    """Load questions from a JSON-lines file.

    Args:
        question_file: path to a .jsonl file, one question dict per line.
        begin, end: optional slice bounds applied after loading.

    Returns:
        A list of question dicts.
    """
    questions = []
    with open(question_file, "r") as ques_file:
        for line in ques_file:
            # Skip blank lines (e.g. a trailing newline). The old truthiness
            # test let "\n" through and crashed inside json.loads.
            if line.strip():
                questions.append(json.loads(line))
    return questions[begin:end]
87
+
88
+
89
def load_model_answers(answer_dir: str):
    """Load model answers.

    The return value is a python dict of type:
    Dict[model_name: str -> Dict[question_id: int -> answer: dict]]
    """
    model_answers = {}

    for filename in sorted(glob.glob(os.path.join(answer_dir, "*.jsonl"))):
        # "name.jsonl" -> "name"
        model_name = os.path.basename(filename)[:-6]
        answers = {}
        with open(filename) as fin:
            for raw_line in fin:
                record = json.loads(raw_line)
                answers[record["question_id"]] = record
        model_answers[model_name] = answers

    return model_answers
109
+
110
+
111
def load_judge_prompts(prompt_file: str):
    """Load judge prompts.

    The return value is a python dict of type:
    Dict[judge_name: str -> dict]
    """
    with open(prompt_file) as fin:
        records = [json.loads(raw_line) for raw_line in fin]
    return {record["name"]: record for record in records}
123
+
124
+
125
def run_judge_single(question, answer, judge, ref_answer, multi_turn=False):
    """Ask the judge model to rate one answer.

    Returns:
        (rating, user_prompt, judgment); rating is -1 when no score could
        be parsed out of the judgment text.
    """
    kwargs = {}
    model = judge.model_name
    if ref_answer is not None:
        kwargs["ref_answer_1"] = ref_answer["choices"][0]["turns"][0]
        kwargs["ref_answer_2"] = ref_answer["choices"][0]["turns"][1]

    if multi_turn:
        # Two-turn prompt: both questions and both answers are filled in.
        user_prompt = judge.prompt_template["prompt_template"].format(
            question_1=question["turns"][0],
            question_2=question["turns"][1],
            answer_1=answer["choices"][0]["turns"][0],
            answer_2=answer["choices"][0]["turns"][1],
            **kwargs,
        )
    else:
        user_prompt = judge.prompt_template["prompt_template"].format(
            question=question["turns"][0],
            answer=answer["choices"][0]["turns"][0],
            **kwargs,
        )

    rating = -1

    system_prompt = judge.prompt_template["system_prompt"]
    # NOTE(review): get_conversation_template is not imported in this file —
    # presumably it comes from fastchat; confirm the import exists upstream.
    conv = get_conversation_template(model)
    conv.system = system_prompt
    conv.append_message(conv.roles[0], user_prompt)
    conv.append_message(conv.roles[1], None)

    if model in ["gpt-3.5-turbo", "gpt-4"]:
        judgment = chat_compeletion_openai(model, conv, temperature=0, max_tokens=2048)
    elif model in ["claude-v1", "claude-instant-v1"]:
        judgment = chat_compeletion_anthropic(
            model, conv, temperature=0, max_tokens=1024
        )
    else:
        raise ValueError(f"Invalid judge model name: {model}")

    if judge.prompt_template["output_format"] == "[[rating]]":
        # Prefer the strict [[x]] form; fall back to a bare [x].
        match = re.search(one_score_pattern, judgment)
        if not match:
            match = re.search(one_score_pattern_backup, judgment)

        if match:
            rating = ast.literal_eval(match.groups()[0])
        else:
            rating = -1
    else:
        raise ValueError(
            f"invalid output format: {judge.prompt_template['output_format']}"
        )

    return rating, user_prompt, judgment
179
+
180
+
181
def play_a_match_single(match: MatchSingle, output_file: str):
    """Run one single-answer grading match and optionally append the result.

    Args:
        match: the (question, model, answer, judge) bundle to grade.
            (Annotation fixed: the body unpacks MatchSingle fields
            (.model, .answer), not MatchPair's.)
        output_file: if truthy, the result dict is appended as one JSON line.

    Returns:
        The result dict (question_id, model, judge, prompts, score, turn, ...).
    """
    question, model, answer, judge, ref_answer, multi_turn = (
        match.question,
        match.model,
        match.answer,
        match.judge,
        match.ref_answer,
        match.multi_turn,
    )

    if judge.prompt_template["type"] == "single":
        score, user_prompt, judgment = run_judge_single(
            question, answer, judge, ref_answer, multi_turn=multi_turn
        )

        question_id = question["question_id"]
        turn = 1 if not multi_turn else 2
        result = {
            "question_id": question_id,
            "model": model,
            "judge": (judge.model_name, judge.prompt_template["name"]),
            "user_prompt": user_prompt,
            "judgment": judgment,
            "score": score,
            "turn": turn,
            "tstamp": time.time(),
        }
        print(
            f"question: {question_id}, turn: {turn}, model: {model}, "
            f"score: {score}, "
            f"judge: {(judge.model_name, judge.prompt_template['name'])}"
        )
    else:
        # Judge is a dataclass; the old f"{judge['type']}" subscript raised
        # TypeError instead of the intended ValueError.
        raise ValueError(f"invalid judge type: {judge.prompt_template['type']}")

    if output_file:
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        with open(output_file, "a") as fout:
            fout.write(json.dumps(result) + "\n")

    return result
222
+
223
+
224
def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=False):
    """Ask the judge model to compare two answers.

    Returns:
        (winner, user_prompt, judgment); winner is "A", "B", "tie", or
        "error" when no verdict could be parsed.
    """
    kwargs = {}
    model = judge.model_name
    if ref_answer is not None:
        kwargs["ref_answer_1"] = ref_answer["choices"][0]["turns"][0]
        kwargs["ref_answer_2"] = ref_answer["choices"][0]["turns"][1]

    if multi_turn:
        # Two-turn prompt: both questions and both models' two answers.
        system_prompt = judge.prompt_template["system_prompt"]
        user_prompt = judge.prompt_template["prompt_template"].format(
            question_1=question["turns"][0],
            question_2=question["turns"][1],
            answer_a_1=answer_a["choices"][0]["turns"][0],
            answer_b_1=answer_b["choices"][0]["turns"][0],
            answer_a_2=answer_a["choices"][0]["turns"][1],
            answer_b_2=answer_b["choices"][0]["turns"][1],
            **kwargs,
        )
    else:
        system_prompt = judge.prompt_template["system_prompt"]
        user_prompt = judge.prompt_template["prompt_template"].format(
            question=question["turns"][0],
            answer_a=answer_a["choices"][0]["turns"][0],
            answer_b=answer_b["choices"][0]["turns"][0],
            **kwargs,
        )

    winner = "error"

    # NOTE(review): get_conversation_template is not imported in this file —
    # presumably it comes from fastchat; confirm the import exists upstream.
    conv = get_conversation_template(model)
    conv.append_message(conv.roles[0], user_prompt)
    conv.append_message(conv.roles[1], None)

    if model in ["gpt-3.5-turbo", "gpt-4"]:
        conv.system = system_prompt
        judgment = chat_compeletion_openai(model, conv, temperature=0, max_tokens=2048)
    elif model in ["claude-v1", "claude-instant-v1"]:
        # Fold a non-default system prompt into the user message for Claude.
        if system_prompt != "You are a helpful assistant.":
            user_prompt = "[Instruction]\n" + system_prompt + "\n\n" + user_prompt
            conv.messages[0][1] = user_prompt
        judgment = chat_compeletion_anthropic(
            model, conv, temperature=0, max_tokens=1024
        )
    else:
        raise ValueError(f"Invalid judge model name: {model}")

    if judge.prompt_template["output_format"] == "[[A]]":
        # Verdict form: [[A]] / [[B]] / [[C]] (tie).
        if "[[A]]" in judgment:
            winner = "A"
        elif "[[B]]" in judgment:
            winner = "B"
        elif "[[C]]" in judgment:
            winner = "tie"
        else:
            winner = "error"
    elif judge.prompt_template["output_format"] == "[[rating_a,rating_b]]":
        # Score form: two numeric ratings; close scores count as a tie.
        match = re.search(two_score_pattern, judgment)
        if not match:
            match = re.search(two_score_pattern_backup, judgment)
        if match:
            scores = [ast.literal_eval(s.strip()) for s in match.groups()]
            if abs(scores[0] - scores[1]) <= TIE_DELTA:
                winner = "tie"
            elif scores[0] > scores[1]:
                winner = "A"
            else:
                winner = "B"
        else:
            winner = "error"
    else:
        raise ValueError(
            f"invalid output format: {judge.prompt_template['output_format']}"
        )

    return winner, user_prompt, judgment
299
+
300
+
301
def play_a_match_pair(match: MatchPair, output_file: str):
    """Run one pairwise match (both side orderings) and optionally log it.

    Args:
        match: the (question, two models, two answers, judge) bundle.
        output_file: if truthy, the result dict is appended as one JSON line.

    Returns:
        The result dict.
    """
    question, model_1, model_2, answer_1, answer_2, judge, ref_answer, multi_turn = (
        match.question,
        match.model_1,
        match.model_2,
        match.answer_1,
        match.answer_2,
        match.judge,
        match.ref_answer,
        match.multi_turn,
    )

    if judge.prompt_template["type"] == "pairwise":
        # Play two games with the sides swapped to cancel position bias.
        g1_winner, g1_user_prompt, g1_judgment = run_judge_pair(
            question, answer_1, answer_2, judge, ref_answer, multi_turn=multi_turn
        )
        g2_winner, g2_user_prompt, g2_judgment = run_judge_pair(
            question, answer_2, answer_1, judge, ref_answer, multi_turn=multi_turn
        )

        g1_map = {"A": "model_1", "B": "model_2"}
        g2_map = {"A": "model_2", "B": "model_1"}
        g1_winner = g1_map.get(g1_winner, g1_winner)
        g2_winner = g2_map.get(g2_winner, g2_winner)
        question_id = question["question_id"]
        turn = 1 if not multi_turn else 2

        result = {
            "question_id": question_id,
            "model_1": model_1,
            "model_2": model_2,
            "g1_winner": g1_winner,
            "g2_winner": g2_winner,
            "judge": (judge.model_name, judge.prompt_template["name"]),
            "g1_user_prompt": g1_user_prompt,
            "g1_judgment": g1_judgment,
            "g2_user_prompt": g2_user_prompt,
            "g2_judgment": g2_judgment,
            "turn": turn,
            "tstamp": time.time(),
        }

        print(
            f"question: {question_id}, turn: {turn}, model_1: {model_1}, model_2: {model_2}, "
            f"g1_winner: {g1_winner}, g2_winner: {g2_winner}, "
            f"judge: {(judge.model_name, judge.prompt_template['name'])}"
        )
    elif judge.prompt_template["type"] == "single":
        # Grade each answer independently and compare the scores.
        # BUGFIX: ref_answer is a required positional of run_judge_single;
        # the old calls omitted it (and multi_turn) and raised TypeError.
        m1_score, m1_user_prompt, m1_judgment = run_judge_single(
            question, answer_1, judge, ref_answer, multi_turn=multi_turn
        )
        m2_score, m2_user_prompt, m2_judgment = run_judge_single(
            question, answer_2, judge, ref_answer, multi_turn=multi_turn
        )

        if abs(m1_score - m2_score) <= TIE_DELTA:
            winner = "tie"
        elif m1_score > m2_score:
            winner = "model_1"
        else:
            winner = "model_2"

        question_id = question["question_id"]
        result = {
            "question_id": question_id,
            "model_1": model_1,
            "model_2": model_2,
            "g1_winner": winner,
            "g2_winner": winner,
            "judge": (judge.model_name, judge.prompt_template["name"]),
            "g1_user_prompt": m1_user_prompt,
            "g1_judgment": m1_judgment,
            "g2_user_prompt": m2_user_prompt,
            "g2_judgment": m2_judgment,
            "m1_score": m1_score,
            "m2_score": m2_score,
            "tstamp": time.time(),
        }
        print(
            f"question: {question_id}, model_1: {model_1}, model_2: {model_2}, "
            f"winner: {winner}, m1_score: {m1_score}, m2_score: {m2_score}, "
            f"judge: {(judge.model_name, judge.prompt_template['name'])}"
        )
    else:
        # Judge is a dataclass; the old f"{judge['type']}" subscript raised
        # TypeError instead of the intended ValueError.
        raise ValueError(f"invalid judge type: {judge.prompt_template['type']}")

    if output_file:
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        with open(output_file, "a") as fout:
            fout.write(json.dumps(result) + "\n")

    return result
393
+
394
+
395
def chat_compeletion_openai(model, conv, temperature, max_tokens):
    """Call the OpenAI chat API with retries; return the reply text.

    Retries up to API_MAX_RETRY times, sleeping API_RETRY_SLEEP seconds
    after each failure; returns API_ERROR_OUTPUT if every attempt fails.
    NOTE(review): `openai` is not imported in this file — confirm the import
    exists wherever this function is actually exercised.
    """
    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            messages = conv.to_openai_api_messages()
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                n=1,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            output = response["choices"][0]["message"]["content"]
            break
        except openai.error.OpenAIError as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)

    return output
414
+
415
+
416
def chat_compeletion_anthropic(model, conv, temperature, max_tokens):
    """Call the Anthropic completion API with retries; return the reply text.

    Retries up to API_MAX_RETRY times, sleeping API_RETRY_SLEEP seconds
    after each failure; returns API_ERROR_OUTPUT (stripped) on total failure.
    NOTE(review): `anthropic` is not imported in this file — confirm the
    import exists wherever this function is actually exercised.
    """
    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            # A fresh client per attempt; reads the key from the environment.
            c = anthropic.Client(os.environ["ANTHROPIC_API_KEY"])
            prompt = conv.get_prompt()
            response = c.completion(
                model=model,
                prompt=prompt,
                stop_sequences=[anthropic.HUMAN_PROMPT],
                max_tokens_to_sample=max_tokens,
                temperature=temperature,
            )
            output = response["completion"]
            break
        except anthropic.ApiException as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)
    return output.strip()
435
+
436
+
437
def chat_compeletion_palm(chat_state, model, conv, temperature, max_tokens):
    """Call the PaLM chat API with retries.

    Returns:
        (chat_state, reply text); chat_state is created lazily on first use
        and returned so the caller can reuse the session.
    """
    # Imported lazily so common.py does not require fastchat at import time.
    from fastchat.serve.api_provider import init_palm_chat

    assert model == "palm-2-chat-bison-001"

    if chat_state is None:
        chat_state = init_palm_chat("chat-bison@001")

    parameters = {
        "temperature": temperature,
        "top_p": 0.8,
        "top_k": 40,
        "max_output_tokens": max_tokens,
    }
    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            # Send only the latest user message (second-to-last conv entry).
            response = chat_state.send_message(conv.messages[-2][1], **parameters)
            output = response.text
            break
        except Exception as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)
    return chat_state, output
461
+
462
+
463
def normalize_game_key_single(gamekey, result):
    """Return (gamekey, result) with the two model names in sorted order.

    When the models must be swapped, the winners and the two per-game
    judgments are swapped too, so the result describes the same match.
    """
    qid, model_1, model_2 = gamekey
    if model_1 < model_2:
        return gamekey, result

    swapped_result = {
        "winners": tuple(reverse_model_map.get(w, w) for w in result["winners"]),
        "g1_judgment": result["g2_judgment"],
        "g2_judgment": result["g1_judgment"],
    }
    return (qid, model_2, model_1), swapped_result
476
+
477
+
478
def normalize_game_key_dict(judgment_dict):
    """Apply normalize_game_key_single to every entry of a judgment dict."""
    return dict(
        normalize_game_key_single(key, value) for key, value in judgment_dict.items()
    )
485
+
486
+
487
def load_pairwise_model_judgments(filename: str):
    """Load pairwise model judgments.

    The return value is a dict of type:
    Dict[judge: Tuple -> Dict[game_key: tuple -> game_result: dict]
    """
    judge_dict = {}

    # Context manager closes the file deterministically (the old bare
    # open() leaked the handle until garbage collection).
    with open(filename) as fin:
        for line in fin:
            obj = json.loads(line)
            judge = tuple(obj["judge"])
            qid, model_1, model_2 = obj["question_id"], obj["model_1"], obj["model_2"]

            if judge not in judge_dict:
                judge_dict[judge] = {}

            if "winner" in obj:
                winner = obj["winner"]
            elif "g1_winner" in obj and "g2_winner" in obj:
                # Two games were played with sides swapped; only a
                # consistent verdict counts as a real winner.
                g1_winner, g2_winner = obj["g1_winner"], obj["g2_winner"]
                if g1_winner == g2_winner:
                    winner = g1_winner
                else:
                    winner = "inconsistent"
            else:
                raise ValueError(f"Invalid keys: {list(obj.keys())}")

            gamekey = (qid, model_1, model_2)
            winners = (winner,)

            judge_dict[judge][gamekey] = {
                "winners": winners,
                "g1_judgment": obj["g1_judgment"],
                "g2_judgment": obj["g2_judgment"],
            }

    # Make the model names sorted in the game keys
    normalized = {}
    for judge, value in judge_dict.items():
        normalized[judge] = normalize_game_key_dict(value)
    return normalized
528
+
529
+
530
def load_single_model_judgments(filename: str):
    """Load single-answer-grading judgments from a JSONL file.

    Each line must be a JSON object with "judge", "question_id", "model",
    "score", and "judgment".

    Returns:
        Dict[judge: Tuple -> Dict[game_key: tuple -> game_result: dict]]
    """
    judge_dict = {}

    # Use a context manager so the file handle is closed deterministically
    # (the original iterated over a bare open() and leaked the handle).
    with open(filename) as f:
        for line in f:
            obj = json.loads(line)
            judge = tuple(obj["judge"])
            qid, model = obj["question_id"], obj["model"]

            if judge not in judge_dict:
                judge_dict[judge] = {}

            gamekey = (qid, model)
            judge_dict[judge][gamekey] = {
                "score": obj["score"],
                "judgment": obj["judgment"],
            }
    return judge_dict
553
+
554
+
555
def resolve_pairwise_judgment_dict(
    question, model_judgments_normal, model_judgments_math, multi_turn=False
):
    """Return the correct pairwise judge."""
    # Reference-based categories (e.g. math) use the math judge prompts.
    ref_based = question["category"] in NEED_REF_CATS
    source = model_judgments_math if ref_based else model_judgments_normal
    prompt = "pair-math-v1" if ref_based else "pair-v2"
    if multi_turn:
        prompt += "-multi-turn"
    return source[("gpt-4", prompt)]
568
+
569
+
570
def resolve_single_judgment_dict(
    question, model_judgments_normal, model_judgments_math, multi_turn=False
):
    """Return the correct single answer grading judge."""
    # Reference-based categories (e.g. math) use the math judge prompts.
    ref_based = question["category"] in NEED_REF_CATS
    source = model_judgments_math if ref_based else model_judgments_normal
    prompt = "single-math-v1" if ref_based else "single-v1"
    if multi_turn:
        prompt += "-multi-turn"
    return source[("gpt-4", prompt)]
583
+
584
+
585
def get_pairwise_judge_explanation(gamekey, judgment_dict):
    """Return a markdown explanation of both games for a pairwise judgment.

    `judgment_dict` stores results under canonical keys with sorted model
    names, so when the requested key is unsorted the lookup key is swapped
    and the two game judgments are mirrored.  Returns "N/A" when no
    judgment exists for the key.
    """
    try:
        qid, model_1, model_2 = gamekey
        if model_1 < model_2:
            res = judgment_dict[gamekey]
            g1_judgment, g2_judgment = res["g1_judgment"], res["g2_judgment"]
        else:
            # Look up under the sorted key; the stored game 1 corresponds to
            # our game 2 and vice versa.  (The original also had a no-op
            # `model_1, model_2 = model_1, model_2` here; removed.)
            new_gamekey = (qid, model_2, model_1)
            res = judgment_dict[new_gamekey]
            g1_judgment, g2_judgment = res["g2_judgment"], res["g1_judgment"]

        return (
            f"**Game 1**. **A**: {model_1}, **B**: {model_2}\n\n"
            f"**Judgment**: {g1_judgment}"
            + f"\n\n`--------------------------`\n\n"
            + f"**Game 2**. **A**: {model_2}, **B**: {model_1}\n\n"
            f"**Judgment**: {g2_judgment}"
        )
    except KeyError:
        return "N/A"
608
+
609
+
610
def get_single_judge_explanation(gamekey, judgment_dict):
    """Return a markdown explanation for a single-answer-grading judgment.

    Returns "N/A" when no judgment exists for the key.
    """
    try:
        _, model = gamekey
        record = judgment_dict[gamekey]
        return (
            f"**Game 1**. **A**: {model}, **Score**: {record['score']}\n\n"
            f"**Judgment**: {record['judgment']}"
        )
    except KeyError:
        return "N/A"
626
+
627
+
628
def check_data(questions, model_answers, ref_answers, models, judges):
    """Assert that every required model answer and reference answer exists.

    Raises AssertionError naming the first missing model answer, or the
    first missing reference answer required by a reference-based judge.
    """
    # Every listed model must have an answer for every question.
    for model in models:
        assert model in model_answers, f"Missing model answer for {model}"
        answers = model_answers[model]
        for question in questions:
            qid = question["question_id"]
            assert qid in answers, f"Missing model {model}'s answer to Question {qid}"

    # Reference-based judges need a reference answer for every question in
    # a reference-required category.
    for judge in judges.values():
        if not judge.ref_based:
            continue
        for question in questions:
            if question["category"] not in NEED_REF_CATS:
                continue
            qid = question["question_id"]
            assert (
                qid in ref_answers[judge.model_name]
            ), f"Missing reference answer to Question {qid} for judge {judge.model_name}"
648
+
649
def get_model_list(answer_dir):
    """Return model names derived from the *.jsonl answer files in a directory."""
    names = []
    for path in glob.glob(f"{answer_dir}/*.jsonl"):
        stem, _ = os.path.splitext(os.path.basename(path))
        names.append(stem)
    return names