| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """ |
| Preprocess the nq dataset to parquet format |
| """ |
|
|
| import re |
| import os |
| import datasets |
|
|
| from verl.utils.hdfs_io import copy, makedirs |
| import argparse |
|
|
|
|
def make_prefix(dp, template_type):
    """Build the prompt text for one question.

    Args:
        dp: Mapping for a single example; only ``dp['question']`` is read.
        template_type: Name of the prompt template. Only ``'base'`` is
            implemented.

    Returns:
        The prompt string: fixed answering instructions followed by
        ``Question: <question>`` and a trailing newline.

    Raises:
        KeyError: If ``dp`` has no ``'question'`` entry.
        NotImplementedError: If ``template_type`` is not ``'base'``.
    """
    question = dp['question']

    if template_type == 'base':
        # This works for any base model.
        # Adjacent literals are joined into one line of text, so the prompt
        # does not depend on how this source file is indented.
        prefix = (
            "Answer the given question. "
            "You should first have a reasoning process in mind and then provides the answer. "
            "Show your reasoning in <think> </think> tags and return the final answer in "
            "<answer> </answer> tags, for example <answer> Beijing </answer>. "
            f"Question: {question}\n"
        )
    else:
        # Name the rejected value so a mistyped --template_type is easy to spot.
        raise NotImplementedError(f"Unsupported template_type: {template_type!r}")
    return prefix
|
|
|
|
def normalize_question(question):
    """Return ``question`` stripped of surrounding whitespace and ending in '?'.

    Args:
        question: Raw question text.

    Returns:
        The stripped text, with ``'?'`` appended when it does not already
        end with one. An empty or whitespace-only input yields ``'?'``.
    """
    question = question.strip()
    # endswith() is safe on an empty string, where indexing [-1] would raise
    # IndexError.
    if not question.endswith('?'):
        question += '?'
    return question


if __name__ == '__main__':
    # Command-line options: where to write the parquet files, an optional
    # HDFS destination, and which prompt template to use.
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_dir', default='./data/nq')
    parser.add_argument('--hdfs_dir', default=None)
    parser.add_argument('--template_type', type=str, default='base')

    args = parser.parse_args()

    data_source = 'nq'

    dataset = datasets.load_dataset('RUC-NLPIR/FlashRAG_datasets', 'nq')

    train_dataset = dataset['train']
    test_dataset = dataset['test']

    def make_map_fn(split):
        """Return a per-example callback for ``Dataset.map`` bound to ``split``.

        Args:
            split: Split label stored under ``extra_info['split']`` in every
                record the callback produces.

        Returns:
            A function ``process_fn(example, idx)`` that builds the output
            record for one example.
        """

        def process_fn(example, idx):
            """Build the prompt/ground-truth record for one example.

            Args:
                example: Row with ``'question'`` and ``'golden_answers'``.
                idx: Row index supplied by ``map(..., with_indices=True)``.

            Returns:
                Dict with ``data_source``, ``prompt``, ``ability``,
                ``reward_model`` and ``extra_info`` entries.
            """
            # The normalized question is written back onto the example
            # before the prompt is built from it.
            example['question'] = normalize_question(example['question'])
            question = make_prefix(example, template_type=args.template_type)
            solution = {
                "target": example['golden_answers'],
            }

            data = {
                "data_source": data_source,
                "prompt": [{
                    "role": "user",
                    "content": question,
                }],
                "ability": "fact-reasoning",
                "reward_model": {
                    "style": "rule",
                    "ground_truth": solution
                },
                "extra_info": {
                    'split': split,
                    'index': idx,
                }
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)

    local_dir = args.local_dir
    hdfs_dir = args.hdfs_dir

    # Create the output directory up front so writing the parquet files does
    # not depend on it already existing.
    os.makedirs(local_dir, exist_ok=True)
    train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
    test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))

    # Mirror the local output directory to HDFS only when a destination
    # was given.
    if hdfs_dir is not None:
        makedirs(hdfs_dir)

        copy(src=local_dir, dst=hdfs_dir)
|
|