|
|
"""Preprocessing functions for various benchmark datasets. |
|
|
|
|
|
This module provides data loading and prompt formatting functions for: |
|
|
- Math benchmarks: MATH, GSM8K, AIME, Minerva Math, OmniMath, etc. |
|
|
- Coding benchmarks: HumanEval, LiveCodeBench, MBPP |
|
|
- Multiple-choice: MMLU, MMLU Pro, GPQA |
|
|
- Instruction following: IFEval, IFBench, MT-Bench |
|
|
- General: AlpacaEval, Arena-Hard |
|
|
""" |
|
|
|
|
|
import json |
|
|
import pandas |
|
|
import os |
|
|
|
|
|
|
|
|
def preprocess_gpqa_chatml_template(data_file, use_r1=False, think=True):
    """Build GPQA multiple-choice prompts wrapped in a ChatML template.

    Args:
        data_file: Path to the GPQA JSON file.
        use_r1: If True, format for DeepSeek R1-style models instead of ChatML.
        think: In the non-R1 path, True appends the " /think" switch and opens
            a <think> block; False appends " /no_think".

    Returns:
        list: One formatted prompt string per question.
    """
    if use_r1:
        template = "{Question}\n\n\nA. {choice1}\nB. {choice2}\nC. {choice3}\nD. {choice4}\n\nPlease reason step-by-step and put your choice letter without any other text with \\boxed{{}} in the end. Let's think step by step and output the final answer within \\boxed{{}}."
    else:
        template = "Return your final response within \\boxed{{}} and only include the letter choice (e.g., A, B, C, or D) as your final response.\n\n{Question}\n\nAnswer Choices:\n(A) {choice1}\n(B) {choice2}\n(C) {choice3}\n(D) {choice4}"
    system_header = "<|im_start|>system\nYou are a helpful and harmless assistant.<|im_end|>\n"

    with open(data_file, "r") as f:
        records = json.load(f)

    prompts = []
    for rec in records:
        question_text = template.format(
            Question=rec['question'].strip(),
            choice1=rec['choice_A'].strip(),
            choice2=rec['choice_B'].strip(),
            choice3=rec['choice_C'].strip(),
            choice4=rec['choice_D'].strip(),
        )
        if use_r1:
            prompt = "<|begin▁of▁sentence|><|User|>" + question_text + ".<|Assistant|><think>\n"
        elif think:
            prompt = system_header + "<|im_start|>user\n" + question_text + " /think<|im_end|>\n<|im_start|>assistant\n<think>\n"
        else:
            prompt = system_header + "<|im_start|>user\n" + question_text + " /no_think<|im_end|>\n<|im_start|>assistant\n"
        prompts.append(prompt)

    return prompts
|
|
|
|
|
|
|
|
def preprocess_gpqa_raw_template(data_file, use_r1=False, think=True):
    """Build raw (template-free) GPQA multiple-choice prompts.

    Args:
        data_file: Path to the GPQA JSON file.
        use_r1: If True, use the DeepSeek R1-style question wording.
        think: Unused here; kept for signature parity with the ChatML variant.

    Returns:
        list: One formatted question string per item, with no chat markup.
    """
    if use_r1:
        template = "{Question}\n\n\nA. {choice1}\nB. {choice2}\nC. {choice3}\nD. {choice4}\n\nPlease reason step-by-step and put your choice letter without any other text with \\boxed{{}} in the end. Let's think step by step and output the final answer within \\boxed{{}}."
    else:
        template = "Return your final response within \\boxed{{}} and only include the letter choice (e.g., A, B, C, or D) as your final response.\n\n{Question}\n\nAnswer Choices:\n(A) {choice1}\n(B) {choice2}\n(C) {choice3}\n(D) {choice4}"

    with open(data_file, "r") as f:
        records = json.load(f)

    return [
        template.format(
            Question=rec['question'].strip(),
            choice1=rec['choice_A'].strip(),
            choice2=rec['choice_B'].strip(),
            choice3=rec['choice_C'].strip(),
            choice4=rec['choice_D'].strip(),
        )
        for rec in records
    ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def preprocess_gsm8k_zeroshot_chatml_template(data_file):
    """Build zero-shot ChatML prompts for the GSM8K benchmark.

    Args:
        data_file: Path to the GSM8K JSON file.

    Returns:
        list: ChatML prompts, each opening an assistant <think> block.
    """
    system_header = "<|im_start|>system\nYou are a helpful and harmless assistant. You should think step-by-step.<|im_end|>\n"

    with open(data_file, "r") as f:
        records = json.load(f)

    return [
        system_header
        + "<|im_start|>user\n"
        + rec['question'].strip()
        + "<|im_end|>\n<|im_start|>assistant\n<think>\n"
        for rec in records
    ]
|
|
|
|
|
def preprocess_gsm8k_zeroshot_raw(data_file):
    """Load GSM8K questions as plain, template-free prompts.

    Args:
        data_file: Path to the GSM8K JSON file.

    Returns:
        list: Stripped question strings, one per item.
    """
    with open(data_file, "r") as f:
        records = json.load(f)
    return [rec['question'].strip() for rec in records]
|
|
|
|
|
|
|
|
def preprocess_humaneval_raw(data_file):
    """Build raw code-completion prompts for HumanEval.

    Args:
        data_file: Path to the HumanEval JSON file (a dict keyed by task id).

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: Instruction text followed by each function stub.
            - qid_list: Task IDs, in the file's key order.
    """
    instruction = "Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n"

    with open(data_file, "r") as f:
        task_dict = json.load(f)

    qid_list = list(task_dict.keys())
    prompt_list = [instruction + task['prompt'] for task in task_dict.values()]
    return prompt_list, qid_list
|
|
|
|
|
|
|
|
def preprocess_math_zeroshot_chatml_template(data_file):
    """Build zero-shot ChatML prompts for the MATH benchmark.

    Args:
        data_file: Path to the MATH CSV file with a 'Question' column.

    Returns:
        list: ChatML prompts, each opening an assistant <think> block.
    """
    system_header = "<|im_start|>system\nYou are a helpful and harmless assistant. You should think step-by-step.<|im_end|>\n"
    frame = pandas.read_csv(data_file)

    prompts = []
    for _, row in frame.iterrows():
        question = row.to_dict()['Question'].strip()
        prompts.append(
            system_header + "<|im_start|>user\n" + question + "<|im_end|>\n<|im_start|>assistant\n<think>\n"
        )
    return prompts
|
|
|
|
|
|
|
|
def preprocess_math500_zeroshot_chatml_template(data_file, use_r1=False):
    """Preprocess MATH-500 dataset with zero-shot prompting.

    Args:
        data_file: Path to MATH-500 JSONL file (one JSON object per line,
            with a 'problem' key).
        use_r1: Whether to use DeepSeek R1-style prompting (default: False).

    Returns:
        list: Formatted prompts with a boxed-answer instruction.
    """
    instruction = "<|im_start|>system\nYou are a helpful and harmless assistant. You should think step-by-step.<|im_end|>\n"

    prompt_list = []
    with open(data_file, "r") as f:
        for line in f:
            data_dict = json.loads(line)
            final_question = data_dict['problem'].strip()
            if use_r1:
                # BUGFIX: the template previously wrote "\boxed" inside a
                # non-raw string, which Python interprets as the backspace
                # escape "\b" + "oxed" — the prompt contained a control
                # character instead of the literal \boxed{}. It must be
                # escaped as "\\boxed".
                final_prompt = "<|begin▁of▁sentence|><|User|>{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.<|Assistant|><think>\n".format(question=final_question)
            else:
                final_prompt = instruction + "<|im_start|>user\n" + final_question + "\n\nPlease place your final answer inside \\boxed{}." + "<|im_end|>\n<|im_start|>assistant\n<think>\n"
            prompt_list.append(final_prompt)

    return prompt_list
|
|
|
|
|
|
|
|
def preprocess_minerva_math_chatml_template(data_file):
    """Build ChatML prompts for Minerva Math with a boxed-answer instruction.

    Args:
        data_file: Path to the Minerva Math JSONL file ('problem' key).

    Returns:
        list: ChatML prompts, each opening an assistant <think> block.
    """
    system_header = "<|im_start|>system\nYou are a helpful and harmless assistant. You should think step-by-step.<|im_end|>\n"
    suffix = "\n\nPlease place your final answer inside \\boxed{}.<|im_end|>\n<|im_start|>assistant\n<think>\n"

    with open(data_file, "r") as f:
        records = [json.loads(raw) for raw in f]

    return [
        system_header + "<|im_start|>user\n" + rec['problem'].strip() + suffix
        for rec in records
    ]
|
|
|
|
|
|
|
|
def preprocess_gaokao2023en_chatml_template(data_file):
    """Build ChatML prompts for Gaokao 2023 (English) with a boxed-answer instruction.

    Args:
        data_file: Path to the Gaokao 2023 English JSONL file ('question' key).

    Returns:
        list: ChatML prompts, each opening an assistant <think> block.
    """
    system_header = "<|im_start|>system\nYou are a helpful and harmless assistant. You should think step-by-step.<|im_end|>\n"
    suffix = "\n\nPlease place your final answer inside \\boxed{}.<|im_end|>\n<|im_start|>assistant\n<think>\n"

    with open(data_file, "r", encoding="utf-8") as f:
        records = [json.loads(raw) for raw in f]

    return [
        system_header + "<|im_start|>user\n" + rec['question'].strip() + suffix
        for rec in records
    ]
|
|
|
|
|
|
|
|
def preprocess_olympiadbench_chatml_template(data_file):
    """Build ChatML prompts for OlympiadBench with a boxed-answer instruction.

    Args:
        data_file: Path to the OlympiadBench JSONL file ('question' key).

    Returns:
        list: ChatML prompts, each opening an assistant <think> block.
    """
    system_header = "<|im_start|>system\nYou are a helpful and harmless assistant. You should think step-by-step.<|im_end|>\n"
    suffix = "\n\nPlease place your final answer inside \\boxed{}.<|im_end|>\n<|im_start|>assistant\n<think>\n"

    with open(data_file, "r", encoding="utf-8") as f:
        records = [json.loads(raw) for raw in f]

    return [
        system_header + "<|im_start|>user\n" + rec['question'].strip() + suffix
        for rec in records
    ]
|
|
|
|
|
|
|
|
def preprocess_collegemath_chatml_template(data_file):
    """Build ChatML prompts for College Math with a boxed-answer instruction.

    Args:
        data_file: Path to the College Math JSONL file ('question' key).

    Returns:
        list: ChatML prompts, each opening an assistant <think> block.
    """
    system_header = "<|im_start|>system\nYou are a helpful and harmless assistant. You should think step-by-step.<|im_end|>\n"
    suffix = "\n\nPlease place your final answer inside \\boxed{}.<|im_end|>\n<|im_start|>assistant\n<think>\n"

    with open(data_file, "r") as f:
        records = [json.loads(raw) for raw in f]

    return [
        system_header + "<|im_start|>user\n" + rec['question'].strip() + suffix
        for rec in records
    ]
|
|
|
|
|
|
|
|
def preprocess_aime24_chatml_template(data_file):
    """Build zero-shot ChatML prompts for AIME 2024.

    Args:
        data_file: Path to the AIME 2024 JSONL file ('question' key).

    Returns:
        list: ChatML prompts, each opening an assistant <think> block.
    """
    system_header = "<|im_start|>system\nYou are a helpful and harmless assistant. You should think step-by-step.<|im_end|>\n"

    with open(data_file, "r") as f:
        records = [json.loads(raw) for raw in f]

    return [
        system_header + "<|im_start|>user\n" + rec['question'].strip() + "<|im_end|>\n<|im_start|>assistant\n<think>\n"
        for rec in records
    ]
|
|
|
|
|
|
|
|
def preprocess_aime25_chatml_template(data_file):
    """Build zero-shot ChatML prompts for AIME 2025.

    Note the AIME 2025 file uses the 'problem' key, unlike the 2024 file
    which uses 'question'.

    Args:
        data_file: Path to the AIME 2025 JSONL file ('problem' key).

    Returns:
        list: ChatML prompts, each opening an assistant <think> block.
    """
    system_header = "<|im_start|>system\nYou are a helpful and harmless assistant. You should think step-by-step.<|im_end|>\n"

    with open(data_file, "r") as f:
        records = [json.loads(raw) for raw in f]

    return [
        system_header + "<|im_start|>user\n" + rec['problem'].strip() + "<|im_end|>\n<|im_start|>assistant\n<think>\n"
        for rec in records
    ]
|
|
|
|
|
|
|
|
|
|
|
def preprocess_aime24_raw(data_file):
    """Build raw AIME 2024 prompts with a boxed-answer instruction.

    Args:
        data_file: Path to the AIME 2024 JSONL file ('question' key).

    Returns:
        list: Plain prompts, no chat markup.
    """
    suffix = "\nPlease reason step by step, and put your final answer within \\boxed{}."
    with open(data_file, "r") as f:
        return [json.loads(raw)['question'].strip() + suffix for raw in f]
|
|
|
|
|
|
|
|
def preprocess_aime25_raw(data_file):
    """Build raw AIME 2025 prompts with a boxed-answer instruction.

    Note the AIME 2025 file uses the 'problem' key, unlike the 2024 file
    which uses 'question'.

    Args:
        data_file: Path to the AIME 2025 JSONL file ('problem' key).

    Returns:
        list: Plain prompts, no chat markup.
    """
    suffix = "\nPlease reason step by step, and put your final answer within \\boxed{}."
    with open(data_file, "r") as f:
        return [json.loads(raw)['problem'].strip() + suffix for raw in f]
|
|
|
|
|
|
|
|
def preprocess_amc23_chatml_template(data_file):
    """Build zero-shot ChatML prompts for AMC 2023.

    Args:
        data_file: Path to the AMC 2023 JSONL file ('question' key).

    Returns:
        list: ChatML prompts, each opening an assistant <think> block.
    """
    system_header = "<|im_start|>system\nYou are a helpful and harmless assistant. You should think step-by-step.<|im_end|>\n"

    with open(data_file, "r") as f:
        records = [json.loads(raw) for raw in f]

    return [
        system_header + "<|im_start|>user\n" + rec['question'].strip() + "<|im_end|>\n<|im_start|>assistant\n<think>\n"
        for rec in records
    ]
|
|
|
|
|
|
|
|
def preprocess_omnimath_chatml_template(data_file):
    """Build zero-shot ChatML prompts for OmniMath.

    Args:
        data_file: Path to the OmniMath JSONL file ('problem' key).

    Returns:
        list: ChatML prompts, each opening an assistant <think> block.
    """
    system_header = "<|im_start|>system\nYou are a helpful and harmless assistant. You should think step-by-step.<|im_end|>\n"

    with open(data_file, "r") as f:
        records = [json.loads(raw) for raw in f]

    return [
        system_header + "<|im_start|>user\n" + rec['problem'].strip() + "<|im_end|>\n<|im_start|>assistant\n<think>\n"
        for rec in records
    ]
|
|
|
|
|
|
|
|
def preprocess_ifeval_chatml_template(data_file):
    """Build ChatML prompts for the IFEval instruction-following benchmark.

    Args:
        data_file: Path to the IFEval JSONL file ('key' and 'prompt' keys).

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: ChatML-wrapped prompts (no system message,
              no <think> block).
            - qid_list: Task IDs from the 'key' field.
    """
    with open(data_file, "r") as f:
        records = [json.loads(raw) for raw in f]

    qid_list = [rec['key'] for rec in records]
    prompt_list = [
        "<|im_start|>user\n" + rec['prompt'] + "<|im_end|>\n<|im_start|>assistant\n"
        for rec in records
    ]
    return prompt_list, qid_list
|
|
|
|
|
|
|
|
def preprocess_ifeval_raw(data_file):
    """Load IFEval prompts without any chat-template wrapping.

    Args:
        data_file: Path to the IFEval JSONL file ('key' and 'prompt' keys).

    Returns:
        tuple: (prompt_list, qid_list) — raw 'prompt' strings and their
        'key' ids, in file order.
    """
    with open(data_file, "r") as f:
        records = [json.loads(raw) for raw in f]

    qid_list = [rec['key'] for rec in records]
    prompt_list = [rec['prompt'] for rec in records]
    return prompt_list, qid_list
|
|
|
|
|
|
|
|
def preprocess_ifbench_raw(data_file):
    """Load IFBench prompts without any chat-template wrapping.

    Args:
        data_file: Path to the IFBench JSONL file ('key' and 'prompt' keys).

    Returns:
        tuple: (prompt_list, qid_list) — raw 'prompt' strings and their
        'key' ids, in file order.
    """
    with open(data_file, "r") as f:
        records = [json.loads(raw) for raw in f]

    qid_list = [rec['key'] for rec in records]
    prompt_list = [rec['prompt'] for rec in records]
    return prompt_list, qid_list
|
|
|
|
|
|
|
|
def preprocess_arena_hard_chatml_template(data_file):
    """Build ChatML prompts from the first turn of Arena-Hard questions.

    Args:
        data_file: Path to the Arena-Hard JSONL file ('question_id' and
            'turns' keys; each turn is a dict with 'content').

    Returns:
        tuple: (prompt_list, qid_list) — ChatML-wrapped first-turn prompts
        and their question ids.
    """
    with open(data_file, "r") as f:
        records = [json.loads(raw) for raw in f]

    qid_list = [rec['question_id'] for rec in records]
    prompt_list = [
        "<|im_start|>user\n" + rec['turns'][0]['content'] + "<|im_end|>\n<|im_start|>assistant\n"
        for rec in records
    ]
    return prompt_list, qid_list
|
|
|
|
|
def preprocess_arena_hard_raw(data_file):
    """Load Arena-Hard first-turn questions without chat-template wrapping.

    Args:
        data_file: Path to the Arena-Hard JSONL file ('question_id' and
            'turns' keys; each turn is a dict with 'content').

    Returns:
        tuple: (prompt_list, qid_list) — raw first-turn question strings and
        their question ids.
    """
    with open(data_file, "r") as f:
        records = [json.loads(raw) for raw in f]

    qid_list = [rec['question_id'] for rec in records]
    prompt_list = [rec['turns'][0]['content'] for rec in records]
    return prompt_list, qid_list
|
|
|
|
|
def preprocess_arena_hard_v2_raw(data_file):
    """Load Arena-Hard v2.0 prompts without chat-template wrapping.

    Note the v2.0 schema differs from v1: questions carry a 'uid' and a flat
    'prompt' string instead of 'question_id' and 'turns'.

    Args:
        data_file: Path to the Arena-Hard v2.0 JSONL file.

    Returns:
        tuple: (prompt_list, qid_list) — raw prompt strings and their uids.
    """
    with open(data_file, "r") as f:
        records = [json.loads(raw) for raw in f]

    qid_list = [rec['uid'] for rec in records]
    prompt_list = [rec['prompt'] for rec in records]
    return prompt_list, qid_list
|
|
|
|
|
|
|
|
def preprocess_alpaca_eval_raw(data_file):
    """Load AlpacaEval instructions without chat-template wrapping.

    Args:
        data_file: Path to the AlpacaEval JSON file (a list of items with
            an 'instruction' key).

    Returns:
        tuple: (prompt_list, qid_list) — raw instruction strings and
        sequential integer ids starting at 0.
    """
    with open(data_file, "r") as f:
        records = json.load(f)

    qid_list = list(range(len(records)))
    prompt_list = [rec['instruction'] for rec in records]
    return prompt_list, qid_list
|
|
|
|
|
|
|
|
def preprocess_alpaca_eval_chatml_template(data_file):
    """Build ChatML prompts from AlpacaEval instructions.

    Args:
        data_file: Path to the AlpacaEval JSON file (a list of items with
            an 'instruction' key).

    Returns:
        tuple: (prompt_list, qid_list) — ChatML-wrapped instructions and
        sequential integer ids starting at 0.
    """
    with open(data_file, "r") as f:
        records = json.load(f)

    qid_list = list(range(len(records)))
    prompt_list = [
        "<|im_start|>user\n" + rec['instruction'] + "<|im_end|>\n<|im_start|>assistant\n"
        for rec in records
    ]
    return prompt_list, qid_list
|
|
|
|
|
|
|
|
|
|
|
def preprocess_mtbench_firstturn(data_file):
    """Build ChatML prompts for the first turn of MT-Bench.

    Args:
        data_file: Path to the MT-Bench JSONL file ('question_id' and a
            'turns' list of strings).

    Returns:
        tuple: (prompt_list, qid_list) — ChatML-wrapped first-turn prompts
        and their question ids.
    """
    with open(data_file, "r") as f:
        records = [json.loads(raw) for raw in f]

    qid_list = [rec['question_id'] for rec in records]
    prompt_list = [
        "<|im_start|>user\n" + rec['turns'][0] + "<|im_end|>\n<|im_start|>assistant\n"
        for rec in records
    ]
    return prompt_list, qid_list
|
|
|
|
|
|
|
|
def preprocess_mtbench_firstturn_raw(data_file):
    """Load MT-Bench first-turn questions without chat-template wrapping.

    Args:
        data_file: Path to the MT-Bench JSONL file ('question_id' and a
            'turns' list of strings).

    Returns:
        tuple: (prompt_list, qid_list) — raw first-turn strings and their
        question ids.
    """
    with open(data_file, "r") as f:
        records = [json.loads(raw) for raw in f]

    qid_list = [rec['question_id'] for rec in records]
    prompt_list = [rec['turns'][0] for rec in records]
    return prompt_list, qid_list
|
|
|
|
|
|
|
|
def preprocess_mtbench_secondturn(data_file, output_file):
    """Build ChatML prompts for the second turn of MT-Bench.

    The model's first-turn answer is spliced into the conversation history so
    the prompt reads: user turn 1, assistant answer, user turn 2.

    Args:
        data_file: Path to the MT-Bench JSONL file ('question_id', 'turns').
        output_file: JSONL file of first-turn model outputs; each line must
            contain 'task_id' and 'output'.

    Returns:
        tuple: (prompt_list, qid_list) — second-turn ChatML prompts with
        history, and their question ids.
    """
    with open(data_file, "r") as f:
        records = [json.loads(raw) for raw in f]

    # Map each first-turn task id to the model's answer text.
    first_turn_answers = {}
    with open(output_file, "r") as f:
        for raw in f:
            entry = json.loads(raw)
            first_turn_answers[entry['task_id']] = entry['output']

    qid_list = []
    prompt_list = []
    for rec in records:
        qid = rec['question_id']
        qid_list.append(qid)
        turn1, turn2 = rec['turns'][0], rec['turns'][1]
        answer = first_turn_answers[qid]
        prompt_list.append(
            "<|im_start|>user\n" + turn1 + "<|im_end|>\n<|im_start|>assistant\n"
            + answer + "<|im_end|>\n"
            + "<|im_start|>user\n" + turn2 + "<|im_end|>\n<|im_start|>assistant\n"
        )

    return prompt_list, qid_list
|
|
|
|
|
|
|
|
def preprocess_mtbench_secondturn_raw(data_file, output_file):
    """Preprocess MT-Bench second turn as raw chat-message lists.

    Args:
        data_file: Path to MT-Bench JSONL file ('question_id', 'turns').
        output_file: Model output file (JSONL) for the first turn; each line
            must contain 'task_id' and 'output'.

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: Second-turn prompts as chat message lists
              (user / assistant / user role dicts).
            - qid_list: Question IDs.
    """
    with open(data_file, "r") as f:
        data_list = [json.loads(line) for line in f]

    # Map first-turn task ids to the model's first-turn answer. The previous
    # version also stored 'reason_text' here, but it was unpacked and never
    # used, so only the final output is kept.
    id2output = {}
    with open(output_file, "r") as f:
        for line in f:
            item = json.loads(line)
            id2output[item['task_id']] = item['output']

    qid_list = []
    prompt_list = []
    for item in data_list:
        qid_list.append(item['question_id'])
        chat = [
            {'role': 'user', 'content': item['turns'][0]},
            {'role': 'assistant', 'content': id2output[item['question_id']]},
            {'role': 'user', 'content': item['turns'][1]},
        ]
        prompt_list.append(chat)

    return prompt_list, qid_list
|
|
|
|
|
|
|
|
def preprocess_mmlu_chatml_template(data_file):
    """Build few-shot chain-of-thought MMLU prompts in ChatML format.

    Args:
        data_file: Path to the MMLU CSV file (columns: Subject, Question,
            A, B, C, D).

    Returns:
        list: Prompts consisting of the subject's few-shot examples followed
        by the test question, wrapped in <|im_start|> user/assistant markers.
    """
    def _load_mmlu_cot_fewshot_examples():
        # Few-shot CoT examples live as YAML files (one per MMLU subject) in
        # the 'flan_cot_fewshot' folder next to this module.
        import yaml
        here = os.path.dirname(os.path.abspath(__file__))
        fewshot_folder = os.path.join(here, "flan_cot_fewshot")
        fewshot_dict = {}
        for filename in os.listdir(fewshot_folder):
            with open(os.path.join(fewshot_folder, filename)) as f:
                config = yaml.safe_load(f)
                subject = config['dataset_name'].strip()
                pieces = [config['description'].strip()]
                for sample in config['fewshot_config']["samples"]:
                    pieces.append("Q: " + sample['question'].strip() + "\n" + "A: " + sample['target'].strip())
                fewshot_dict[subject] = "\n\n".join(pieces)
        return fewshot_dict

    fewshot_dict = _load_mmlu_cot_fewshot_examples()
    frame = pandas.read_csv(data_file)

    prompt_list = []
    for _, row in frame.iterrows():
        rec = row.to_dict()
        fewshot_prompt = fewshot_dict[rec['Subject']]
        option_line = (
            "(A) " + str(rec['A']).strip()
            + " (B) " + str(rec['B']).strip()
            + " (C) " + str(rec['C']).strip()
            + " (D) " + str(rec['D']).strip()
        )
        body = fewshot_prompt + "\n\n" + "Q: " + rec['Question'] + "\n" + option_line + "\n" + "A: "
        prompt_list.append("<|im_start|>user\n" + body + "<|im_end|>\n<|im_start|>assistant\n")

    return prompt_list
|
|
|
|
|
|
|
|
def preprocess_mmlu_raw_template(data_file):
    """Build few-shot chain-of-thought MMLU prompts without chat markup.

    Args:
        data_file: Path to the MMLU CSV file (columns: Subject, Question,
            A, B, C, D).

    Returns:
        list: Raw prompts — the subject's few-shot examples followed by the
        test question and an "A: " continuation cue.
    """
    def _load_mmlu_cot_fewshot_examples():
        # Few-shot CoT examples live as YAML files (one per MMLU subject) in
        # the 'flan_cot_fewshot' folder next to this module.
        import yaml
        here = os.path.dirname(os.path.abspath(__file__))
        fewshot_folder = os.path.join(here, "flan_cot_fewshot")
        fewshot_dict = {}
        for filename in os.listdir(fewshot_folder):
            with open(os.path.join(fewshot_folder, filename)) as f:
                config = yaml.safe_load(f)
                subject = config['dataset_name'].strip()
                pieces = [config['description'].strip()]
                for sample in config['fewshot_config']["samples"]:
                    pieces.append("Q: " + sample['question'].strip() + "\n" + "A: " + sample['target'].strip())
                fewshot_dict[subject] = "\n\n".join(pieces)
        return fewshot_dict

    fewshot_dict = _load_mmlu_cot_fewshot_examples()
    frame = pandas.read_csv(data_file)

    prompt_list = []
    for _, row in frame.iterrows():
        rec = row.to_dict()
        fewshot_prompt = fewshot_dict[rec['Subject']]
        option_line = (
            "(A) " + str(rec['A']).strip()
            + " (B) " + str(rec['B']).strip()
            + " (C) " + str(rec['C']).strip()
            + " (D) " + str(rec['D']).strip()
        )
        prompt_list.append(
            fewshot_prompt + "\n\n" + "Q: " + rec['Question'] + "\n" + option_line + "\n" + "A: "
        )

    return prompt_list
|
|
|
|
|
|
|
|
# Zero-shot multiple-choice prompt used by preprocess_mmlu_r1_raw_template:
# the model is instructed to finish with a line of the form 'Answer: $LETTER'.
QUERY_TEMPLATE_MULTICHOICE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.

{Question}

A) {A}
B) {B}
C) {C}
D) {D}
""".strip()
|
|
|
|
|
def preprocess_mmlu_r1_raw_template(data_file):
    """Build zero-shot MMLU prompts for R1-style reasoning models.

    Args:
        data_file: Path to the MMLU CSV file (columns: Subject, Question,
            A, B, C, D).

    Returns:
        list: Prompts built from the module-level QUERY_TEMPLATE_MULTICHOICE.
    """
    frame = pandas.read_csv(data_file)

    prompt_list = []
    for _, row in frame.iterrows():
        rec = row.to_dict()
        # The subject column is read (and so required) but not used in this
        # zero-shot template.
        subject = rec['Subject']
        prompt_list.append(
            QUERY_TEMPLATE_MULTICHOICE.format(
                Question=rec['Question'],
                A=str(rec['A']).strip(),
                B=str(rec['B']).strip(),
                C=str(rec['C']).strip(),
                D=str(rec['D']).strip(),
            )
        )

    return prompt_list
|
|
|
|
|
def preprocess_mmlu_r1_raw_template_wdai(data_file):
    """Build zero-shot MMLU prompts asking for a boxed letter answer.

    Args:
        data_file: Path to the MMLU CSV file (columns: Question, A, B, C, D).

    Returns:
        list: Prompts instructing the model to conclude with
        "The answer is \\boxed{X}.".
    """
    template = "Answer the following multiple-choice question. At the end of your response, conclude with the sentence `The answer is \\boxed{{X}}.`, replacing X with the correct capital letter of your choice.\n\n{Question}\n\nAnswer Choices:\n(A) {choice1}\n(B) {choice2}\n(C) {choice3}\n(D) {choice4}"
    frame = pandas.read_csv(data_file)

    prompt_list = []
    for _, row in frame.iterrows():
        rec = row.to_dict()
        prompt_list.append(
            template.format(
                Question=rec['Question'].strip(),
                choice1=str(rec['A']).strip(),
                choice2=str(rec['B']).strip(),
                choice3=str(rec['C']).strip(),
                choice4=str(rec['D']).strip(),
            )
        )

    return prompt_list
|
|
|
|
|
|
|
|
def preprocess_mmlu_pro_chatml_template(data_file, fewshot_file):
    """Preprocess MMLU-Pro dataset with 5-shot ChatML template.

    Builds one ChatML user turn per test question, consisting of a subject
    header, five worked (chain-of-thought) examples from the validation
    split, and finally the unanswered test question ending in "A: ".

    Args:
        data_file: Path to MMLU-Pro test JSON file
        fewshot_file: Path to MMLU-Pro validation JSON file (for few-shot examples)

    Returns:
        list: Formatted prompts with 5-shot examples and ChatML template

    Raises:
        ValueError: If a subject does not have exactly 5 few-shot examples.
        KeyError: If a test sample's category is absent from the few-shot file.
    """
    def _preprocess(data_list):
        # MMLU-Pro pads every question's options and marks unused slots
        # with "N/A"; drop those placeholders (item dicts are mutated in place).
        output_list = []
        for item in data_list:
            item["options"] = [opt for opt in item["options"] if opt != "N/A"]
            output_list.append(item)
        return output_list

    def _categorize_basedon_subject(data_list):
        # Group few-shot examples by their 'category' field.
        fewshot_dict = {}
        for item in data_list:
            fewshot_dict.setdefault(item['category'], []).append(item)
        return fewshot_dict

    def _format_each_sample(sample, is_test):
        # Option labels; letters cover up to 16 options.
        choices = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P"]

        sample_prompt = "Q: " + sample['question'] + "\n"
        for i, opt in enumerate(sample['options']):
            sample_prompt += "(%s) %s " % (choices[i], opt)
        sample_prompt = sample_prompt.strip() + "\n"

        if is_test:
            # Leave the answer open for the model to complete.
            sample_prompt += "A: "
        else:
            # Few-shot examples include the worked chain-of-thought answer.
            sample_prompt += sample['cot_content'].strip()

        return sample_prompt

    def _get_fewshot_prompt(fewshot_samples, test_sample, subject):
        # Standard MMLU-style header, then the worked examples, then the
        # unanswered test question.
        description = "The following are multiple choice questions (with answers) about %s." % subject

        final_question = description + "\n\n"
        for sample in fewshot_samples:
            final_question += _format_each_sample(sample, is_test=False) + "\n\n"

        final_question += _format_each_sample(test_sample, is_test=True)
        return final_question

    with open(fewshot_file, "r") as f:
        fewshot_list = json.load(f)
    with open(data_file, "r") as f:
        test_list = json.load(f)

    fewshot_dict = _categorize_basedon_subject(_preprocess(fewshot_list))
    test_list = _preprocess(test_list)

    prompt_list = []
    for test_sample in test_list:
        subject = test_sample['category']
        fewshot_samples = fewshot_dict[subject]

        # The MMLU-Pro validation split provides exactly 5 examples per
        # category; validate explicitly instead of `assert`, which is
        # silently stripped under `python -O`.
        if len(fewshot_samples) != 5:
            raise ValueError(
                "expected 5 few-shot examples for subject %r, got %d"
                % (subject, len(fewshot_samples))
            )

        final_question = _get_fewshot_prompt(fewshot_samples, test_sample, subject)
        final_prompt = "<|im_start|>user\n" + final_question + "<|im_end|>\n<|im_start|>assistant\n"
        prompt_list.append(final_prompt)

    return prompt_list
|
|
|
|
|
|
|
|
def preprocess_mmlu_pro_zero_shot_chatml_template(data_file, think=True):
    """Preprocess MMLU-Pro dataset with zero-shot ChatML template.

    Args:
        data_file: Path to MMLU-Pro test JSON file
        think: Whether to enable thinking mode (default: True)

    Returns:
        list: Formatted prompts with ChatML template and boxed answer instruction
    """
    def _preprocess(data_list):
        # MMLU-Pro pads every question's options and marks unused slots
        # with "N/A"; drop those placeholders (item dicts are mutated in place).
        output_list = []
        for item in data_list:
            item["options"] = [opt for opt in item["options"] if opt != "N/A"]
            output_list.append(item)
        return output_list

    def _format_each_sample(sample):
        # Option labels; letters cover up to 16 options.
        choices = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P"]

        sample_prompt = "Question:\n" + sample['question'] + "\n\nAnswer Choices:"
        for i, opt in enumerate(sample['options']):
            sample_prompt += "\n(%s) %s" % (choices[i], opt)
        # Bug fix: this string is emitted verbatim (never run through
        # str.format), so the former doubled braces "{{X}}" reached the model
        # literally; a single-braced "\boxed{X}" is intended.
        sample_prompt += "\n\nConclude your response with the sentence `The answer is \\boxed{X}.`, in which X is the correct capital letter of your choice."
        sample_prompt = sample_prompt.strip() + "\n"

        return sample_prompt

    instruction = "<|im_start|>system\nYou are a helpful and harmless assistant.<|im_end|>\n"

    with open(data_file, "r") as f:
        test_list = json.load(f)
    test_list = _preprocess(test_list)

    prompt_list = []
    for test_sample in test_list:
        test_prompt = _format_each_sample(test_sample)
        # "/think" primes the model to open a <think> block;
        # "/no_think" requests a direct answer.
        if think:
            final_prompt = instruction + "<|im_start|>user\n" + test_prompt + "\n /think<|im_end|>\n<|im_start|>assistant\n<think>\n"
        else:
            final_prompt = instruction + "<|im_start|>user\n" + test_prompt + "\n /no_think<|im_end|>\n<|im_start|>assistant\n"
        prompt_list.append(final_prompt)

    return prompt_list
|
|
|
|
|
|
|
|
def preprocess_mmlu_pro_zero_shot_raw_template(data_file, think=True):
    """Preprocess MMLU-Pro dataset with zero-shot raw formatting.

    Args:
        data_file: Path to MMLU-Pro test JSON file
        think: Whether to enable thinking mode (default: True; kept for
            signature parity with the ChatML variant but currently unused)

    Returns:
        list: Raw prompts with boxed answer instruction
    """
    def _preprocess(data_list):
        # MMLU-Pro pads every question's options and marks unused slots
        # with "N/A"; drop those placeholders (item dicts are mutated in place).
        output_list = []
        for item in data_list:
            item["options"] = [opt for opt in item["options"] if opt != "N/A"]
            output_list.append(item)
        return output_list

    def _format_each_sample(sample):
        # Option labels; letters cover up to 16 options.
        choices = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P"]

        sample_prompt = "Question:\n" + sample['question'] + "\n\nAnswer Choices:"
        for i, opt in enumerate(sample['options']):
            sample_prompt += "\n(%s) %s" % (choices[i], opt)
        # Bug fix: this string is emitted verbatim (never run through
        # str.format), so the former doubled braces "{{X}}" reached the model
        # literally; a single-braced "\boxed{X}" is intended.
        sample_prompt += "\n\nConclude your response with the sentence `The answer is \\boxed{X}.`, in which X is the correct capital letter of your choice."
        sample_prompt = sample_prompt.strip() + "\n"

        return sample_prompt

    with open(data_file, "r") as f:
        test_list = json.load(f)
    test_list = _preprocess(test_list)

    prompt_list = []
    for test_sample in test_list:
        prompt_list.append(_format_each_sample(test_sample))

    return prompt_list
|
|
|
|
|
|
|
|
def preprocess_livecodebench_chatml_template(data_file):
    """Preprocess LiveCodeBench dataset with ChatML template.

    Args:
        data_file: Path to LiveCodeBench JSON file

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: Formatted coding prompts with ChatML template
            - qid_list: Question IDs
    """
    instruction = "<|im_start|>system\nYou are a helpful and harmless assistant. You should think step-by-step.<|im_end|>\n"

    # Instruction used when the problem ships no starter code: the model
    # writes the whole solution from scratch.
    code_instruction_nostartercode = """Write Python code to solve the problem. Please place the solution code in the following format:\n```python\n# Your solution code here\n```"""
    # Instruction used when a function header is provided.
    code_instruction_hasstartercode = """Please place the solution code in the following format:\n```python\n# Your solution code here\n```"""

    with open(data_file, "r") as f:
        problems = json.load(f)

    prompt_list = []
    qid_list = []
    for problem in problems:
        segments = [problem['question_content'].strip()]
        starter = problem['starter_code']
        if starter != "":
            segments.append(
                "Solve the problem starting with the provided function header.\n\nFunction header:\n```\n"
                + starter + "\n```"
            )
            segments.append(code_instruction_hasstartercode)
        else:
            segments.append(code_instruction_nostartercode)

        user_turn = "\n\n".join(segments)
        prompt_list.append(
            instruction
            + "<|im_start|>user\n"
            + user_turn
            + "<|im_end|>\n<|im_start|>assistant\n<think>\n"
        )
        qid_list.append(problem['question_id'])

    return prompt_list, qid_list
|
|
|
|
|
|
|
|
def preprocess_livecodebench_raw(data_file):
    """Preprocess LiveCodeBench dataset with raw formatting.

    Args:
        data_file: Path to LiveCodeBench JSON file

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: Raw coding prompts
            - qid_list: Question IDs
    """
    # Instruction used when the problem ships no starter code: the model
    # writes the whole solution from scratch.
    solve_from_scratch = """Write Python code to solve the problem. Please place the solution code in the following format:\n```python\n# Your solution code here\n```"""
    # Instruction used when a function header is provided.
    complete_starter = """Please place the solution code in the following format:\n```python\n# Your solution code here\n```"""

    with open(data_file, "r") as f:
        problems = json.load(f)

    prompt_list = []
    qid_list = []
    for problem in problems:
        segments = [problem['question_content'].strip()]
        starter = problem['starter_code']
        if starter != "":
            segments.append(
                "Solve the problem starting with the provided function header.\n\nFunction header:\n```\n"
                + starter + "\n```"
            )
            segments.append(complete_starter)
        else:
            segments.append(solve_from_scratch)

        prompt_list.append("\n\n".join(segments))
        qid_list.append(problem['question_id'])

    return prompt_list, qid_list
|
|
|
|
|
|
|
|
def preprocess_mbpp_chatml_template(data_file):
    """Preprocess MBPP (Mostly Basic Python Problems) dataset with ChatML template.

    Args:
        data_file: Path to MBPP JSON file (a dict keyed by task ID)

    Returns:
        tuple: (prompt_list, qid_list)
            - prompt_list: Formatted code generation prompts with ChatML template
            - qid_list: Task IDs, in the same order as prompt_list
    """
    system_header = "<|im_start|>system\nYou are a helpful and harmless assistant. You should think step-by-step.<|im_end|>\n"

    with open(data_file, "r") as f:
        tasks = json.load(f)

    prompt_list = []
    qid_list = []
    for task_id, task in tasks.items():
        # Prefer the 'text' field; fall back to 'prompt', then empty string.
        problem_text = task.get('text', task.get('prompt', ''))
        prompt_list.append(
            system_header
            + "<|im_start|>user\n"
            + problem_text
            + "<|im_end|>\n<|im_start|>assistant\n<think>\n"
        )
        qid_list.append(task_id)

    return prompt_list, qid_list
|
|
|
|
|
|
|
|
def preprocess_mmlu_stem_chatml_template(data_file):
    """Preprocess MMLU STEM subset with ChatML template.

    Args:
        data_file: Path to MMLU STEM JSON file

    Returns:
        list: Formatted prompts with ChatML template
    """
    system_header = "<|im_start|>system\nYou are a helpful and harmless assistant. You should think step-by-step.<|im_end|>\n"

    with open(data_file, "r") as f:
        samples = json.load(f)

    prompt_list = []
    for sample in samples:
        lines = [sample['question'].strip()]
        # Label options (A), (B), ... in order; samples without a
        # 'choices' field are emitted as the bare question.
        for idx, option in enumerate(sample.get('choices', [])):
            lines.append("(%s) %s" % (chr(65 + idx), option))
        user_turn = "\n".join(lines) + "\n"

        prompt_list.append(
            system_header
            + "<|im_start|>user\n"
            + user_turn
            + "<|im_end|>\n<|im_start|>assistant\n<think>\n"
        )

    return prompt_list
|
|
|
|
|
|