"""Prepare the MultiClinSum dataset for verl RL training.

Loads a local JSON file of clinical records (keys: ``id``, ``fulltext``,
``summary``), formats each record with a prompt template, splits it into
train/test (95/5), and writes ``train.parquet`` / ``test.parquet`` to the
output directory.
"""
import argparse
import json
import os

# Default location of the prompt template; override with --prompt_path.
DEFAULT_PROMPT_PATH = "/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/prompt"


def load_prompt_template(path=DEFAULT_PROMPT_PATH):
    """Read and return the prompt template text from *path*.

    The template is expected to contain the ``{source_lang}``,
    ``{gold_summary}`` and ``{full_text}`` placeholders used below.
    """
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


def make_map_fn(split, data_source, prompt_template=None):
    """Build a ``datasets.Dataset.map`` callback for one split.

    Args:
        split: Split label stored in ``extra_info`` ('train' or 'test').
        data_source: Dataset identifier stored in each record.
        prompt_template: Template string with ``{source_lang}``,
            ``{gold_summary}`` and ``{full_text}`` placeholders.  If
            ``None``, the template is loaded from ``DEFAULT_PROMPT_PATH``
            (the original behavior).

    Returns:
        A ``process_fn(example, idx)`` callable producing the verl record
        layout (prompt / ability / reward_model / extra_info).
    """
    if prompt_template is None:
        prompt_template = load_prompt_template()

    def process_fn(example, idx):
        # Extract fields from the source JSON keys: ['id', 'fulltext', 'summary'].
        full_text = example.pop("fulltext")
        gold_summary = example.pop("summary")

        # 'English' is the fixed source language (input file is the *_en split).
        prompt_content = prompt_template.format(
            source_lang="English",
            gold_summary=gold_summary,
            full_text=full_text,
        )

        return {
            "data_source": data_source,
            "prompt": [{
                "role": "user",
                "content": prompt_content,
            }],
            "ability": "summarization",
            # Rule-based reward scored against the gold summary.
            "reward_model": {
                "style": "rule",
                "ground_truth": gold_summary,
            },
            "extra_info": {
                "split": split,
                "index": idx,
                # Fall back to the row index if the record has no 'id'.
                "original_id": example.get("id", idx),
            },
        }

    return process_fn


def _parse_args():
    """Parse command-line options for input/output locations."""
    parser = argparse.ArgumentParser()
    # Path to the input JSON file.
    parser.add_argument(
        '--input_path',
        default='/home/mshahidul/readctrl/data/processed_test_raw_data/multiclinsum_test_en.json',
    )
    # Destination directory for the Parquet outputs.
    parser.add_argument(
        '--local_dir',
        default='/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset',
    )
    # Prompt template location (defaults to the original hard-coded path).
    parser.add_argument('--prompt_path', default=DEFAULT_PROMPT_PATH)
    return parser.parse_args()


if __name__ == '__main__':
    # Imported here so the module stays importable without HuggingFace datasets.
    import datasets

    args = _parse_args()
    data_source = 'multiclinsum'
    prompt_template = load_prompt_template(args.prompt_path)

    # Load the local JSON file.
    with open(args.input_path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    # Convert to a HuggingFace Dataset and split 95% train / 5% test.
    dataset = datasets.Dataset.from_list(raw_data)
    split_dataset = dataset.train_test_split(test_size=0.05, seed=42)

    # Apply the mapping transformation to each split.
    processed_train = split_dataset["train"].map(
        function=make_map_fn('train', data_source, prompt_template),
        with_indices=True,
    )
    processed_test = split_dataset["test"].map(
        function=make_map_fn('test', data_source, prompt_template),
        with_indices=True,
    )

    # Create the output directory if it doesn't exist.
    os.makedirs(args.local_dir, exist_ok=True)

    # Save both splits to Parquet.
    train_output_path = os.path.join(args.local_dir, 'train.parquet')
    test_output_path = os.path.join(args.local_dir, 'test.parquet')
    processed_train.to_parquet(train_output_path)
    processed_test.to_parquet(test_output_path)

    print("--- Dataset Preparation Complete ---")
    print(f"Train file saved to: {train_output_path}")
    print(f"Test file saved to: {test_output_path}")
    print(f"Total train records: {len(processed_train)}")
    print(f"Total test records: {len(processed_test)}")