| import os |
| import json |
| import datasets |
| import argparse |
| from verl.utils.hdfs_io import copy, makedirs |
|
|
| |
| |
# Load the prompt template once at import time. The text is later filled in via
# str.format() with the keyword fields `source_lang`, `gold_summary` and
# `full_text` (see make_map_fn), so any literal braces in the template file
# must be escaped as `{{` / `}}`.
# The encoding is pinned to UTF-8 so the template decodes the same way
# regardless of the machine's locale settings.
with open("/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/prompt", 'r', encoding="utf-8") as f:
    PROMPT_TEMPLATE = f.read()
|
|
def make_map_fn(split, data_source, *, source_lang="English", prompt_template=None):
    """Build a per-example mapper that converts a raw record into an RL sample.

    Args:
        split: Name of the split the mapper is used for (e.g. 'train' or
            'test'); stored verbatim under ``extra_info.split``.
        data_source: Dataset identifier stored verbatim under ``data_source``.
        source_lang: Value substituted for ``{source_lang}`` in the prompt
            template. Defaults to "English".
        prompt_template: ``str.format`` template accepting the keyword fields
            ``source_lang``, ``gold_summary`` and ``full_text``. When ``None``
            the module-level ``PROMPT_TEMPLATE`` is used.

    Returns:
        A function ``process_fn(example, idx)`` that returns a dict with the
        keys ``data_source``, ``prompt``, ``ability``, ``reward_model`` and
        ``extra_info``.

    Raises:
        KeyError: From ``process_fn`` when ``example`` has no 'fulltext' or
            'summary' key, or when the template references an unknown field.
    """
    # Only touch the module-level template when no override was supplied.
    template = PROMPT_TEMPLATE if prompt_template is None else prompt_template

    def process_fn(example, idx):
        # pop() removes the raw text fields from `example` itself; presumably
        # this keeps the raw columns out of the mapped output -- verify
        # against the datasets.map merge semantics before changing to get().
        full_text = example.pop('fulltext')
        gold_summary = example.pop('summary')

        prompt_content = template.format(
            source_lang=source_lang,
            gold_summary=gold_summary,
            full_text=full_text,
        )

        return {
            "data_source": data_source,
            "prompt": [{
                "role": "user",
                "content": prompt_content,
            }],
            "ability": "summarization",
            "reward_model": {
                "style": "rule",
                "ground_truth": gold_summary,
            },
            "extra_info": {
                "split": split,
                "index": idx,
                # Fall back to the positional index when the record has no id.
                "original_id": example.get('id', idx),
            },
        }

    return process_fn
|
|
if __name__ == '__main__':
    # Script entry point: read a raw JSON dataset, split it into train/test,
    # convert every record with make_map_fn, and write both splits as parquet.
    parser = argparse.ArgumentParser()
    # JSON file holding the raw records.
    parser.add_argument('--input_path', default='/home/mshahidul/readctrl/data/processed_test_raw_data/multiclinsum_test_en.json')
    # Directory that receives train.parquet and test.parquet.
    parser.add_argument('--local_dir', default='/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset')
    # Split controls; the defaults reproduce the previously hard-coded values.
    parser.add_argument('--test_size', type=float, default=0.05)
    parser.add_argument('--seed', type=int, default=42)
    args = parser.parse_args()

    data_source = 'multiclinsum'

    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    with open(args.input_path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    # NOTE(review): from_list presumably expects a list of dicts that each
    # carry 'fulltext' and 'summary' (the keys make_map_fn pops) -- confirm
    # against the input file.
    dataset = datasets.Dataset.from_list(raw_data)

    # A fixed seed keeps the train/test partition reproducible across runs.
    split_dataset = dataset.train_test_split(test_size=args.test_size, seed=args.seed)

    # with_indices=True passes the row index as the mapper's second argument.
    processed_train = split_dataset["train"].map(
        function=make_map_fn('train', data_source),
        with_indices=True,
    )
    processed_test = split_dataset["test"].map(
        function=make_map_fn('test', data_source),
        with_indices=True,
    )

    os.makedirs(args.local_dir, exist_ok=True)

    train_output_path = os.path.join(args.local_dir, 'train.parquet')
    test_output_path = os.path.join(args.local_dir, 'test.parquet')
    processed_train.to_parquet(train_output_path)
    processed_test.to_parquet(test_output_path)

    print("--- Dataset Preparation Complete ---")
    print(f"Train file saved to: {train_output_path}")
    print(f"Test file saved to: {test_output_path}")
    print(f"Total train records: {len(processed_train)}")
    print(f"Total test records: {len(processed_test)}")