| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | """ |
| | Preprocess the nq dataset to parquet format |
| | """ |
| |
|
| | import re |
| | import os |
| | import datasets |
| |
|
| | from verl.utils.hdfs_io import copy, makedirs |
| | import argparse |
| |
|
| |
|
def make_prefix(dp, template_type):
    """Build the prompt text for a single dataset example.

    Args:
        dp: Mapping with a ``'question'`` entry holding the question text.
        template_type: Name of the prompt template. Only ``'base'`` is
            implemented.

    Returns:
        The instruction prompt followed by ``Question: <question>`` and a
        trailing newline.

    Raises:
        NotImplementedError: If ``template_type`` is not ``'base'``.
    """
    question = dp['question']

    if template_type == 'base':
        # This template works for any base model.
        # The wording is runtime data and is kept verbatim; implicit string
        # concatenation keeps it independent of source indentation.
        prefix = (
            "Answer the given question. "
            "You should first have a reasoning process in mind and then provides the answer. "
            "Show your reasoning in <think> </think> tags and return the final answer "
            "in <answer> </answer> tags, for example <answer> Beijing </answer>. "
            f"Question: {question}\n"
        )
    else:
        # Name the rejected value so a mistyped --template_type is easy to spot.
        raise NotImplementedError(
            f"Unsupported template_type: {template_type!r} (only 'base' is implemented)"
        )
    return prefix
| |
|
| |
|
if __name__ == '__main__':
    # Command-line options: local output directory, optional HDFS mirror
    # directory, and the prompt template passed to make_prefix().
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_dir', default='./data/nq')
    parser.add_argument('--hdfs_dir', default=None)
    parser.add_argument('--template_type', type=str, default='base')

    args = parser.parse_args()

    data_source = 'nq'

    dataset = datasets.load_dataset('RUC-NLPIR/FlashRAG_datasets', 'nq')

    train_dataset = dataset['train']
    test_dataset = dataset['test']

    def make_map_fn(split):
        """Return a ``Dataset.map`` callback that tags each record with ``split``."""

        def process_fn(example, idx):
            """Convert one raw example into a record with prompt, ability,
            reward_model and extra_info fields.

            The question is stripped and given a trailing ``?`` if it lacks
            one; the normalised text is also written back to
            ``example['question']``.
            """
            question_text = example['question'].strip()
            # endswith() rather than indexing [-1]: an empty question must
            # not raise IndexError.
            if not question_text.endswith('?'):
                question_text += '?'
            example['question'] = question_text

            prompt_text = make_prefix(example, template_type=args.template_type)
            solution = {
                "target": example['golden_answers'],
            }

            return {
                "data_source": data_source,
                "prompt": [{
                    "role": "user",
                    "content": prompt_text,
                }],
                "ability": "fact-reasoning",
                "reward_model": {
                    "style": "rule",
                    "ground_truth": solution,
                },
                "extra_info": {
                    'split': split,
                    'index': idx,
                },
            }

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)

    local_dir = args.local_dir
    hdfs_dir = args.hdfs_dir

    # Ensure the output directory exists; whether to_parquet creates missing
    # parent directories may depend on the installed datasets version.
    os.makedirs(local_dir, exist_ok=True)

    train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
    test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))

    # Mirror the local output to HDFS only when a destination was given.
    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_dir, dst=hdfs_dir)
| |
|