| | import os |
| | import json |
| | import datasets |
| | import argparse |
| | from verl.utils.hdfs_io import copy, makedirs |
| |
|
| | |
| | |
# Prompt template shared by every record. The hard-coded absolute path can be
# overridden via the PROMPT_TEMPLATE_PATH environment variable so the script
# is portable across machines (default preserves the original behavior).
_PROMPT_PATH = os.environ.get(
    "PROMPT_TEMPLATE_PATH",
    "/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/prompt",
)
with open(_PROMPT_PATH, 'r', encoding='utf-8') as f:
    PROMPT_TEMPLATE = f.read()
| |
|
def make_map_fn(split, data_source):
    """Build a ``datasets.Dataset.map``-compatible closure.

    The returned function converts one raw record (with ``fulltext``,
    ``summary`` and optional subclaim fields) into the verl RL-training
    schema: a chat-style prompt, a rule-based reward_model payload, and
    bookkeeping metadata.

    Args:
        split: Split label ("train"/"test") recorded in ``extra_info``.
        data_source: Identifier stored in the ``data_source`` field.

    Returns:
        ``process_fn(example, idx)`` suitable for ``map(..., with_indices=True)``.
    """
    def process_fn(example, idx):
        # Consume the raw fields, removing them from the record as we go.
        source_document = example.pop('fulltext')
        reference_summary = example.pop('summary')
        doc_claims = example.pop('fulltext_subclaims', None)
        summary_claims = example.pop('summary_subclaims', None)

        # Render the shared prompt template with this record's content.
        rendered_prompt = PROMPT_TEMPLATE.format(
            source_lang="English",
            gold_summary=reference_summary,
            full_text=source_document,
        )

        # Subclaims serve as the rule-based reward's ground truth and are
        # duplicated into extra_info for downstream inspection.
        ground_truth = {
            "summary_subclaims": summary_claims,
            "fulltext_subclaims": doc_claims,
        }
        metadata = {
            "split": split,
            "index": idx,
            "original_id": example.get('id', idx),
            "fulltext_subclaims": doc_claims,
            "summary_subclaims": summary_claims,
        }

        return {
            "data_source": data_source,
            "prompt": [{"role": "user", "content": rendered_prompt}],
            "ability": "summarization",
            "reward_model": {"style": "rule", "ground_truth": ground_truth},
            "extra_info": metadata,
        }

    return process_fn
| |
|
if __name__ == '__main__':
    # CLI: where to read the raw subclaim-annotated JSON and where to write
    # the processed train/test parquet files.
    parser = argparse.ArgumentParser(
        description="Convert MultiClinSum subclaim JSON into verl parquet datasets."
    )
    parser.add_argument(
        '--input_path',
        default='/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_multiclinsum_test_en_full.json',
        help="Raw JSON file (a list of records with fulltext/summary/subclaims).",
    )
    parser.add_argument(
        '--local_dir',
        default='/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset',
        help="Output directory for train.parquet / test.parquet.",
    )
    args = parser.parse_args()

    data_source = 'multiclinsum'

    # Load the raw records; the file is expected to contain a JSON array.
    with open(args.input_path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    dataset = datasets.Dataset.from_list(raw_data)

    # Hold out 5% for evaluation; a fixed seed keeps the split reproducible.
    dataset_split = dataset.train_test_split(test_size=0.05, seed=42, shuffle=True)

    # Convert each split into the verl training schema.
    processed_train = dataset_split['train'].map(
        function=make_map_fn('train', data_source),
        with_indices=True,
    )
    processed_test = dataset_split['test'].map(
        function=make_map_fn('test', data_source),
        with_indices=True,
    )

    os.makedirs(args.local_dir, exist_ok=True)

    train_output_path = os.path.join(args.local_dir, 'train.parquet')
    test_output_path = os.path.join(args.local_dir, 'test.parquet')
    processed_train.to_parquet(train_output_path)
    processed_test.to_parquet(test_output_path)

    print("--- Dataset Preparation Complete ---")
    print(f"Train file saved to: {train_output_path}")
    print(f"Test file saved to: {test_output_path}")
    print(f"Total train records: {len(processed_train)}")
    print(f"Total test records: {len(processed_test)}")